# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /Users/meng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/meng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')
        
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    
    words = word_tokenize(text)
    
    words = [w for w in words if w not in stopwords.words('english')]
    
    lemmatizer = WordNetLemmatizer()
    
    lemmed = [lemmatizer.lemmatize(w) for w in words]
    lemmed = [lemmatizer.lemmatize(w, pos='v') for w in lemmed]

    return lemmed

### Step 1: Load data and perform a train test split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer(smooth_idf=False)
clf = RandomForestClassifier()

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)

RandomForestClassifier()

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [10]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_pred = clf.predict(X_test_tfidf) 

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [16]:
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

In [17]:
labels = np.unique(y_test)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = accuracy_score(y_test, y_pred)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 93   0  25]
 [  0  22   6]
 [  8   0 447]]
Accuracy: 0.9351081530782029


# Final Step: Refactor
Organize these steps into the following functions.

In [19]:
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

[nltk_data] Downloading package punkt to /Users/meng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/meng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
def display_results(y_test, y_pred):
    labels = np.unique(y_test)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = accuracy_score(y_test, y_pred)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    # load data
    X, y = load_data()

    # perform train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Instantiate transformers and classifier
    #vect = CountVectorizer(tokenizer=tokenize)
    #tfidf = TfidfTransformer(smooth_idf=False)
    tfidf = TfidfVectorizer()
    clf = RandomForestClassifier()

    # Fit and/or transform each to the data
    #X_train_counts = vect.fit_transform(X_train)
    #X_train_tfidf = tfidf.fit_transform(X_train_counts)
    X_train_tfidf = tfidf.fit_transform(X_train)
    clf.fit(X_train_tfidf, y_train)

    # Transform test data
    #X_test_counts = vect.transform(X_test)
    #X_test_tfidf = tfidf.transform(X_test_counts)
    X_test_tfidf = tfidf.transform(X_test)

    # Predict test labels
    y_pred = clf.predict(X_test_tfidf) 
    
    # Display results
    display_results(y_test, y_pred)

In [24]:
# run program
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 82   0  23]
 [  0  27   7]
 [  6   1 455]]
Accuracy: 0.9384359400998337
