In [1]:
import pickle
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import re
import string
from string import punctuation
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

import matplotlib.pyplot as plt

tqdm.pandas()

In [8]:
videos = {
    "Psy": "9bZkp7q19f0",
    "KatyPerry": "CevxZvSJLk8",
    "LMFAO": "KQ6zr6kCPj8",
    "Eminem": "uelHwf8o7_U",
    "Shakira": "pRpeEdMmmQ0"
}

In [9]:
all_comments = pd.DataFrame()
for filename in os.listdir("media"):
    artist = filename.split(".")[0].split("-")[-1]
    if artist not in videos:
        print(filename)
        continue
    else:
        video_id = videos[artist]
    df = pd.read_csv(f"media/{filename}")
    all_comments = pd.concat([all_comments, df])

youtube_comments_20120117.csv


In [10]:
df = pd.read_csv("media/youtube_comments_20120117.csv", header=None, encoding='utf-8')

In [11]:
all_comments.columns

Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')

In [12]:
df.columns = ["COMMENT_ID", "VIDEO_ID", "AUTHOR", "CONTENT", "CLASS"]

In [17]:
sm_comments = all_comments[["CONTENT", "CLASS"]]

In [15]:
comments = pd.concat([df[["CONTENT", "CLASS"]], all_comments[["CONTENT", "CLASS"]]])
comments.sample(5)

Unnamed: 0,CONTENT,CLASS
6282629,The video wouldn't have been that bad but the ...,0
1452731,can't wait.\n,0
5714917,damn Wiese lost...Ôªø LETS FLIP A CAR!,0
3410018,what is this a parodyÔªø of?,0
2912058,piano musicÔªø bits are from treyarch zombies?,0


In [18]:
print(sm_comments.shape)
sm_comments.drop_duplicates(inplace=True)
sm_comments.shape

(1956, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_comments.drop_duplicates(inplace=True)


(1760, 2)

In [19]:
print(comments.shape)
comments.drop_duplicates(inplace=True)
comments.shape

(6057689, 2)


(6057689, 2)

## Data Cleaning and Feature Building

In [20]:
def has_url(text):
    return bool(re.search(r'((https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b|watch\?v)', text))

def starts_with_punc(text):
    text = text.strip()
    start_char = text[0]
    i = 1
    while i < len(text):
        if text[i] == start_char:
            i += 1
        else:
            break
    return start_char in punctuation and i > 2

def all_caps(text):
    return text.strip().isupper()

def clean(text):

    text = text.lower()
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'@\S+', '', text)
    text = ''.join([i for i in text if not i.isdigit()])
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\\n', ' ', text)
    text = ''.join(c for c in text if c not in punctuation)
    
    return text

In [21]:
comments["HAS_URL"] = comments.CONTENT.progress_apply(has_url).astype(int)
sm_comments["HAS_URL"] = sm_comments.CONTENT.progress_apply(has_url).astype(int)

  0%|          | 0/6057689 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_comments["HAS_URL"] = sm_comments.CONTENT.progress_apply(has_url).astype(int)


In [22]:
comments["CAPS"] = comments.CONTENT.progress_apply(all_caps).astype(int)
sm_comments["CAPS"] = sm_comments.CONTENT.progress_apply(all_caps).astype(int)

  0%|          | 0/6057689 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_comments["CAPS"] = sm_comments.CONTENT.progress_apply(all_caps).astype(int)


In [23]:
comments["PUNC"] = comments.CONTENT.progress_apply(starts_with_punc).astype(int)
sm_comments["PUNC"] = sm_comments.CONTENT.progress_apply(starts_with_punc).astype(int)

  0%|          | 0/6057689 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_comments["PUNC"] = sm_comments.CONTENT.progress_apply(starts_with_punc).astype(int)


In [24]:
comments["CLEAN_CONTENT"] = comments.CONTENT.progress_apply(clean)
sm_comments["CLEAN_CONTENT"] = sm_comments.CONTENT.progress_apply(clean)

  0%|          | 0/6057689 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_comments["CLEAN_CONTENT"] = sm_comments.CONTENT.progress_apply(clean)


In [25]:
comments.sample(10)

Unnamed: 0,CONTENT,CLASS,HAS_URL,CAPS,PUNC,CLEAN_CONTENT
4396425,You guys realize this is a bud light commercial.Ôªø,0,0,0,0,you guys realize this is a bud light commercialÔªø
9164,"ele falou , pr√≥xima semana , ai cortaram",0,0,0,0,ele falou pr√≥xima semana ai cortaram
4697217,My favourite literal so farÔªø =),0,0,0,0,my favourite literal so farÔªø
5353387,"""I can see it in your eyes that you really wan...",0,0,0,0,i can see it in your eyes that you really want...
5272948,moreÔªø skyblock,0,0,0,0,moreÔªø skyblock
3031408,@user1556274 \nXbox\npc have so many bugs you ...,0,0,0,0,xbox pc have so many bugs you have to fix yo...
148464,I'd really like that copy =),0,0,0,0,id really like that copy
1831517,i like to barrel roll on fridays,0,0,0,0,i like to barrel roll on fridays
2204303,His eye is yellow... Did you get beat up? :(,0,0,0,0,his eye is yellow did you get beat up
5532232,@user384421 its Gerudoku 32xÔªø Textures,0,0,0,0,its gerudoku xÔªø textures


In [26]:
sm_comments.sample(10)

Unnamed: 0,CONTENT,CLASS,HAS_URL,CAPS,PUNC,CLEAN_CONTENT
189,How To Make A Lot Of Money Fast,1,0,0,0,how to make a lot of money fast
68,"I loved, she is amazing.. OMG your eyes*_*Ôªø",0,0,0,0,i loved she is amazing omg your eyesÔªø
86,music yeahÔªø,0,0,0,0,music yeahÔªø
190,Have you tried a new social network TSU? This ...,1,1,0,0,have you tried a new social network tsu this n...
28,I dont even watch it anymore i just come here ...,0,0,0,0,i dont even watch it anymore i just come here ...
317,please subscribe to my page. thanks.,1,0,0,0,please subscribe to my page thanks
126,Thumbs up if shrek is gay üëçÔªø,1,0,0,0,thumbs up if shrek is gay üëçÔªø
127,Shakira is different :) She is so happy all th...,0,0,0,0,shakira is different she is so happy all the ...
287,the best!,0,0,0,0,the best
346,Thumbs up if you listen this in 2015.Ôªø,0,0,0,0,thumbs up if you listen this in Ôªø


In [28]:
clean_comments = comments[comments.CLEAN_CONTENT.str.strip().str.len() > 1]
clean_comments = clean_comments[~clean_comments.CLEAN_CONTENT.isna()]
clean_comments.shape

(6038868, 6)

In [27]:
sm_clean_comments = sm_comments[sm_comments.CLEAN_CONTENT.str.strip().str.len() > 1]
sm_clean_comments = sm_clean_comments[~sm_clean_comments.CLEAN_CONTENT.isna()]
sm_clean_comments.shape

(1729, 6)

In [29]:
clean_comments.reset_index(drop=True, inplace=True)
clean_comments.sample(10)

Unnamed: 0,CONTENT,CLASS,HAS_URL,CAPS,PUNC,CLEAN_CONTENT
2009658,@user146516 An absolute yes to both of the que...,0,0,0,0,an absolute yes to both of the questions shes...
5966442,@user591317 8==DÔªø,0,0,0,0,dÔªø
1708378,"Also, take gay out of it, literally: a (GAY) p...",0,0,0,0,also take gay out of it literally a gay person...
922516,YOUR ONLY SUPPOSE TO BLOW THE BLOODY DOORS OFF!,0,0,1,0,your only suppose to blow the bloody doors off
2826531,ŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáÿ£ÿ£ÿ£Ÿä ÿßŸÑŸÑŸá Ÿäÿ≥ÿπÿØŸÉ ÿØŸÜŸäÿß ŸàÿßÿÆÿ±Ÿá Ÿäÿßÿ®Ÿà ...,0,0,0,0,ŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáÿ£ÿ£ÿ£Ÿä ÿßŸÑŸÑŸá Ÿäÿ≥ÿπÿØŸÉ ÿØŸÜŸäÿß ŸàÿßÿÆÿ±Ÿá Ÿäÿßÿ®Ÿà ŸÖ...
3700978,@user885333 imÔªø going for you,0,0,0,0,imÔªø going for you
712063,"Hola Megan, soy de Espa√±a y me encanta como ca...",0,0,0,0,hola megan soy de espa√±a y me encanta como can...
1069127,@user704253 WTF BITCH SHE NOT FUCKING KESHA GO...,0,0,0,0,wtf bitch she not fucking kesha go fix your e...
4124485,how i met yourÔªø mother!,0,0,0,0,how i met yourÔªø mother
3591327,I can't believe IÔªø just watched a video on peo...,0,0,0,0,i cant believe iÔªø just watched a video on peop...


In [30]:
sm_clean_comments.reset_index(drop=True, inplace=True)
sm_clean_comments.sample(10)

Unnamed: 0,CONTENT,CLASS,HAS_URL,CAPS,PUNC,CLEAN_CONTENT
268,Most viewed video on youtube...daaaaaaaaaaannn...,0,0,0,0,most viewed video on youtubedaaaaaaaaaaannng t...
1591,Please visit this Website: oldchat.tk,1,0,0,0,please visit this website oldchattk
1290,I hope everyone is in good spirits I&#39;m a h...,1,0,0,0,i hope everyone is in good spirits im a hard w...
1509,ShakiraÔªø,0,0,0,0,shakiraÔªø
457,It is a shitÔªø,0,0,0,0,it is a shitÔªø
1050,Subscribe me Secret videos :DÔªø,1,0,0,0,subscribe me secret videos dÔªø
1363,‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏èÔªø,0,0,0,0,‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏èÔªø
931,"<a href=""https://m.freemyapps.com/share/url/10...",1,1,0,0,a hrefaÔªø
1227,"I love this-the talents of eminem and Skylar,w...",0,0,0,0,i love thisthe talents of eminem and skylarwor...
829,Thumbs up if you&#39;re watching in 2015Ôªø,0,0,0,0,thumbs up if youre watching in Ôªø


## Data Exploration

In [31]:
## Saving Cleaned Data to Save Cleaning Time
clean_comments.to_csv('clean_comments.csv', index=False)
sm_clean_comments.to_csv('sm_clean_comments.csv', index=False)

In [None]:
clean_comments = pd.read_csv('clean_comments.csv')

In [3]:
# Checking for imbalance in dataset
spam_comments = clean_comments[clean_comments.CLASS == 1]
ham_comments = clean_comments[clean_comments.CLASS == 0]
spam_comments.shape[0]/clean_comments.shape[0], ham_comments.shape[0]/clean_comments.shape[0]

(0.05548423976149172, 0.9445157602385083)

In [4]:
# Percentage of comments with urls
print("Spam Comment URL %:", spam_comments.HAS_URL.sum()/spam_comments.shape[0]*100)
print("Ham Comment URL %:", ham_comments.HAS_URL.sum()/ham_comments.shape[0]*100)
# Percentage of comments with all uppercase words
print("Spam Comment all uppercase words %:", spam_comments.CAPS.sum()/spam_comments.shape[0]*100)
print("Ham Comment all uppercase words %:", ham_comments.CAPS.sum()/ham_comments.shape[0]*100)
# Percentage of comments that begin with punctuations
print("Spam Comment that begin with punctuations %:", spam_comments.PUNC.sum()/spam_comments.shape[0]*100)
print("Ham Comment that begin with punctuations %:", ham_comments.PUNC.sum()/ham_comments.shape[0]*100)

Spam Comment URL %: 4.12431132148677
Ham Comment URL %: 0.35911109178678235
Spam Comment all uppercase words %: 5.517486315965404
Ham Comment all uppercase words %: 4.262417059766759
Spam Comment that begin with punctuations %: 0.584667912207293
Ham Comment that begin with punctuations %: 0.17430466604228825


## Checkpoint

In [2]:
sm_clean_comments = pd.read_csv('sm_clean_comments.csv')

In [5]:
# Checking for imbalance in dataset
sm_spam_comments = sm_clean_comments[sm_clean_comments.CLASS == 1]
sm_ham_comments = sm_clean_comments[sm_clean_comments.CLASS == 0]
sm_spam_comments.shape[0]/sm_clean_comments.shape[0], sm_ham_comments.shape[0]/sm_clean_comments.shape[0]

(0.47137073452862926, 0.5286292654713707)

In [6]:
# Percentage of comments with urls
print("Spam Comment URL %:", sm_spam_comments.HAS_URL.sum()/sm_spam_comments.shape[0]*100)
print("Ham Comment URL %:", sm_ham_comments.HAS_URL.sum()/sm_ham_comments.shape[0]*100)
# Percentage of comments with all uppercase words
print("Spam Comment all uppercase words %:", sm_spam_comments.CAPS.sum()/sm_spam_comments.shape[0]*100)
print("Ham Comment all uppercase words %:", sm_ham_comments.CAPS.sum()/sm_ham_comments.shape[0]*100)
# Percentage of comments that begin with punctuations
print("Spam Comment that begin with punctuations %:", sm_spam_comments.PUNC.sum()/sm_spam_comments.shape[0]*100)
print("Ham Comment that begin with punctuations %:", sm_ham_comments.PUNC.sum()/sm_ham_comments.shape[0]*100)

Spam Comment URL %: 21.595092024539877
Ham Comment URL %: 0.87527352297593
Spam Comment all uppercase words %: 5.521472392638037
Ham Comment all uppercase words %: 5.470459518599562
Spam Comment that begin with punctuations %: 0.6134969325153374
Ham Comment that begin with punctuations %: 0.10940919037199125


# Model Building

In [8]:
X = sm_clean_comments[["CONTENT"]]
y = sm_clean_comments[['CLASS']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)

In [9]:
X_train_tfidf = vectorizer.fit_transform(X_train["CONTENT"])
X_test_tfidf = vectorizer.transform(X_test["CONTENT"])

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Multinomial NB",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    MultinomialNB(),
]


# iterate over classifiers
for name, clf in zip(names, classifiers):
    print(f"Model: {name}")
    try:
        clf.fit(X_train_tfidf, y_train.CLASS)
        score = clf.score(X_test_tfidf, y_test.CLASS)
    except:
        clf.fit(X_train_tfidf.toarray(), y_train.CLASS)
        score = clf.score(X_test_tfidf.toarray(), y_test.CLASS)
    print(f"Score: {score}")

Model: Nearest Neighbors
Score: 0.6647398843930635
Model: Linear SVM
Score: 0.6069364161849711
Model: RBF SVM
Score: 0.9479768786127167
Model: Decision Tree
Score: 0.8988439306358381
Model: Random Forest
Score: 0.6994219653179191
Model: Neural Net
Score: 0.9508670520231214
Model: AdaBoost
Score: 0.9190751445086706
Model: Naive Bayes
Score: 0.8179190751445087
Model: QDA




Score: 0.6734104046242775
Model: Multinomial NB
Score: 0.9335260115606936


In [57]:
%%time
model = MLPClassifier(alpha=0.001, max_iter=1000, hidden_layer_sizes=(100, 100,), batch_size=16, early_stopping=True, verbose=True)
model.fit(X_train_tfidf, y_train.CLASS)
predictions = model.predict(X_test_tfidf)

Iteration 1, loss = 0.62638644
Validation score: 0.892086
Iteration 2, loss = 0.38017041
Validation score: 0.899281
Iteration 3, loss = 0.19453473
Validation score: 0.906475
Iteration 4, loss = 0.11637790
Validation score: 0.928058
Iteration 5, loss = 0.08091238
Validation score: 0.920863
Iteration 6, loss = 0.06184985
Validation score: 0.928058
Iteration 7, loss = 0.04945354
Validation score: 0.935252
Iteration 8, loss = 0.04166619
Validation score: 0.935252
Iteration 9, loss = 0.03668759
Validation score: 0.942446
Iteration 10, loss = 0.03291888
Validation score: 0.928058
Iteration 11, loss = 0.03027196
Validation score: 0.920863
Iteration 12, loss = 0.02819092
Validation score: 0.920863
Iteration 13, loss = 0.02631595
Validation score: 0.920863
Iteration 14, loss = 0.02500090
Validation score: 0.920863
Iteration 15, loss = 0.02396628
Validation score: 0.920863
Iteration 16, loss = 0.02317418
Validation score: 0.920863
Iteration 17, loss = 0.02249708
Validation score: 0.920863
Iterat

In [58]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[168  10]
 [ 10 158]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       178
           1       0.94      0.94      0.94       168

    accuracy                           0.94       346
   macro avg       0.94      0.94      0.94       346
weighted avg       0.94      0.94      0.94       346



In [None]:
X = clean_comments[["CONTENT"]]
y = clean_comments[['CLASS']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)

## AS-IS

In [None]:
X_train_tfidf = vectorizer.fit_transform(X_train["CONTENT"])
X_test_tfidf = vectorizer.transform(X_test["CONTENT"])

In [179]:
%%time
model = SGDClassifier(alpha=.1, loss='perceptron', n_jobs=-1, random_state=0)
model.fit(X_train_tfidf, y_train.CLASS)
predictions = model.predict(X_test_tfidf)

CPU times: total: 22.6 s
Wall time: 22.5 s


In [180]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[1100962   43503]
 [  45355   21718]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96   1144465
           1       0.33      0.32      0.33     67073

    accuracy                           0.93   1211538
   macro avg       0.65      0.64      0.64   1211538
weighted avg       0.93      0.93      0.93   1211538



## Class Balanced Classification

In [181]:
%%time
model = SGDClassifier(alpha=.1, loss='perceptron', n_jobs=-1, random_state=0, class_weight="balanced")
model.fit(X_train_tfidf, y_train.CLASS)
predictions = model.predict(X_test_tfidf)

CPU times: total: 24.7 s
Wall time: 24.7 s


In [182]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[821834 322631]
 [ 25125  41948]]
              precision    recall  f1-score   support

           0       0.97      0.72      0.83   1144465
           1       0.12      0.63      0.19     67073

    accuracy                           0.71   1211538
   macro avg       0.54      0.67      0.51   1211538
weighted avg       0.92      0.71      0.79   1211538



## Over-Sampling

In [None]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X.values.reshape(-1, 1), y.values.reshape(-1))
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over, test_size=0.2)
X_train_over = X_train_over.reshape(-1)
X_test_over = X_test_over.reshape(-1)
X_train_over_tfidf = vectorizer.fit_transform(X_train_over)
X_test_over_tfidf = vectorizer.transform(X_test_over)

In [None]:
%%time
model = SGDClassifier(alpha=.1, loss='perceptron', n_jobs=-1, random_state=0)
model.fit(X_train_over_tfidf, y_train_over)
predictions = model.predict(X_test_over_tfidf)

In [None]:
print(confusion_matrix(y_test_over, predictions))
print(classification_report(y_test_over, predictions))

## Under-Sampling

In [183]:
undersample = RandomUnderSampler()
X_under, y_under = undersample.fit_resample(X.values.reshape(-1, 1), y.values.reshape(-1))
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_under, y_under, test_size=0.2)
X_train_under = X_train_under.reshape(-1)
X_test_under = X_test_under.reshape(-1)
X_train_under_tfidf = vectorizer.fit_transform(X_train_under)
X_test_under_tfidf = vectorizer.transform(X_test_under)

In [184]:
%%time
model = SGDClassifier(alpha=.1, loss='perceptron', n_jobs=-1, random_state=0)
model.fit(X_train_under_tfidf, y_train_under)
predictions = model.predict(X_test_under_tfidf)

CPU times: total: 2.33 s
Wall time: 2.32 s


In [185]:
confusion_matrix(y_test_under, predictions)
print(classification_report(y_test_under, predictions))

              precision    recall  f1-score   support

           0       0.67      0.68      0.68     67400
           1       0.67      0.65      0.66     66776

    accuracy                           0.67    134176
   macro avg       0.67      0.67      0.67    134176
weighted avg       0.67      0.67      0.67    134176



# Final Model

In [187]:
X_train = X_train_over
y_train = y_train_over
X_test = X_test_over
y_test = y_test_over

In [188]:
model = SGDClassifier(loss='perceptron', n_jobs=-1, random_state=0)
params = {
    'tfidf__min_df': [1, 2],
    'tfidf__max_df': [0.5, 0.95],
    'model__alpha': [0.1, 1, 10],
}
best_params = search_para(X_train, y_train, model=model, params=params)
best_params

Fitting 3 folds for each of 12 candidates, totalling 36 fits


{'model__alpha': 1, 'tfidf__max_df': 0.5, 'tfidf__min_df': 1}

In [189]:
%%time
max_df = best_params["tfidf__max_df"]
min_df = best_params["tfidf__min_df"]
vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True, max_df=max_df, min_df=min_df)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

CPU times: total: 266 ms
Wall time: 272 ms


In [190]:
alpha = best_params["model__alpha"]
model = SGDClassifier(alpha=alpha, loss='perceptron', n_jobs=-1, random_state=0)
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)

In [191]:
print(confusion_matrix(y_test_over, predictions))
print(classification_report(y_test_over, predictions))

[[178  10]
 [ 20 183]]
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       188
           1       0.95      0.90      0.92       203

    accuracy                           0.92       391
   macro avg       0.92      0.92      0.92       391
weighted avg       0.92      0.92      0.92       391

