In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("clean.csv")

In [15]:
# Preview the first rows exactly as read from disk (before any column clean-up).
df.head()

Unnamed: 0.1,Unnamed: 0,body,score,target
0,3,"Hiethcliff, Garfield..?",1,0
1,6,That looks like a Garfield,1,0
2,7,Tell it to pick up some weights or gloves… no ...,1,0
3,8,Please put him on a diet now or he will die wa...,1,0
4,10,I bet this dad never agreed to have a dog in t...,1,0


In [16]:
# Remove the leftover 'Unnamed: ...' column that came in with the CSV.
# Selecting by name (instead of dropping position 0 in place) keeps this cell
# idempotent: re-running it can no longer silently drop `body`.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

In [17]:
# Preview again to confirm only body, score and target remain.
df.head()

Unnamed: 0,body,score,target
0,"Hiethcliff, Garfield..?",1,0
1,That looks like a Garfield,1,0
2,Tell it to pick up some weights or gloves… no ...,1,0
3,Please put him on a diet now or he will die wa...,1,0
4,I bet this dad never agreed to have a dog in t...,1,0


In [47]:
# Count missing values per column before modelling.
df.isnull().sum()

body      0
score     0
target    0
dtype: int64

In [21]:
# Predictors: the comment text plus its score; the label is the `target` column.
features = ['body', 'score']
X = df.loc[:, features]
y = df['target']

In [22]:
# Hold out half of the rows for testing. Stratifying on y keeps the class
# proportions the same in both halves; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.50,
                                                    stratify=y)

In [38]:
# Sanity check: (rows, columns) of the train and test feature frames.
X_train.shape, X_test.shape

((6730, 2), (6730, 2))

In [52]:
# Confirm the test split has no missing values in either feature.
X_test.isnull().sum()

body     0
score    0
dtype: int64

In [24]:
# Bag-of-words over the comment body: English stop words removed, accents
# folded to ASCII, 1- to 6-word n-grams, and min_df=.03 (a float min_df is a
# proportion of documents, so only terms in at least 3% of them are kept).
cv_body = CountVectorizer(stop_words='english', strip_accents = 'ascii', ngram_range=(1, 6), min_df=.03)
# Learn the vocabulary from the training split only, then reuse it unchanged
# on the test split so no test information leaks into the features.
X_train_body = cv_body.fit_transform(X_train.body)
X_test_body = cv_body.transform(X_test.body)

In [26]:
# Turn both sparse count matrices into dense DataFrames whose columns are the
# vocabulary terms, each suffixed with '_body'.
X_train_body_df, X_test_body_df = (
    pd.DataFrame(counts.toarray(),
                 columns=[f'{term}_body' for term in cv_body.get_feature_names_out()])
    for counts in (X_train_body, X_test_body)
)

In [40]:
# Both frames must have the same number of n-gram columns.
X_train_body_df.shape, X_test_body_df.shape

((6730, 144), (6730, 144))

In [32]:
# Preview the n-gram count columns for the test split.
X_test_body_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,trying_body,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Check the vectorised test frame for missing values, column by column.
X_test_body_df.isnull().sum()

action_body                                               0
action performed_body                                     0
action performed automatically_body                       0
action performed automatically contact_body               0
action performed automatically contact moderators_body    0
                                                         ..
work_body                                                 0
www_body                                                  0
www reddit_body                                           0
www reddit com_body                                       0
years_body                                                0
Length: 144, dtype: int64

In [59]:
# Row count of the score column; it has to match the vectorised frame before concatenating.
X_train.score.shape

(6730,)

In [61]:
# Append the raw score column to the n-gram counts. Both sides get a fresh
# 0..n-1 index first so concat pairs rows by position rather than by label
# (X_train/X_test presumably still carry the original, shuffled row labels,
# which would otherwise misalign rows and introduce NaNs).
vector_train_df = pd.concat([X_train_body_df.reset_index(drop=True), X_train.score.reset_index(drop=True)], axis=1)
vector_test_df = pd.concat([X_test_body_df.reset_index(drop=True), X_test.score.reset_index(drop=True)], axis=  1)

In [66]:
# Expect the n-gram columns plus one extra column for score, with unchanged row counts.
vector_train_df.shape, vector_test_df.shape

((6730, 145), (6730, 145))

In [67]:
# Total missing cells in each combined frame; anything non-zero means the concat misaligned rows.
vector_train_df.isnull().sum().sum(), vector_test_df.isnull().sum().sum()

(0, 0)

In [64]:
# Preview the final training matrix (n-gram counts followed by score).
vector_train_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body,score
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,3,0,0,...,0,3,0,0,0,0,0,0,0,1


In [68]:
# Preview the final test matrix (n-gram counts followed by score).
vector_test_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body,score
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,51
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [74]:
def lr_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a logistic regression variant and print its evaluation.

    Parameters
    ----------
    model : str
        'lr_1' searches an L1-penalised grid (liblinear solver only);
        'lr_2' searches an L2-penalised grid (lbfgs and liblinear solvers).
    X_train, X_test : objects exposing ``.values`` (DataFrames in this notebook)
        Feature matrices; the underlying arrays are handed to scikit-learn.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` / ``X_test``.

    Raises
    ------
    ValueError
        If ``model`` is not 'lr_1' or 'lr_2'. Checked before any fitting so
        a typo fails immediately with a clear message.

    Prints the train and test scores of the refit best estimator, the test
    confusion matrix and the winning parameter combination. Returns None.
    """
    # Only the penalty and the solvers that support it differ between variants.
    variants = {
        'lr_1': {'penalty': ['l1'], 'solver': ['liblinear']},
        'lr_2': {'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear']},
    }
    if model not in variants:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(variants)}.")

    # NOTE(review): warm_start is documented as useless for liblinear, so some
    # of these candidates are likely duplicates -- consider dropping it.
    param_grid = {
        'C': [1, 1.5, 2, 2.5],
        'class_weight': ['balanced'],
        'warm_start': [True, False],
        'random_state': [42],
        **variants[model],
    }

    search = GridSearchCV(LogisticRegression(),
                          param_grid,
                          cv=5,
                          verbose=1,
                          n_jobs=-1)
    search.fit(X_train.values, y_train)

    print(f'Train score = {search.score(X_train.values, y_train)}')
    print(f'Test score = {search.score(X_test.values, y_test)}')

    predictions = search.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {search.best_params_}')

In [75]:
# Grid-search the L1-penalised logistic regression and report its train/test metrics.
lr_models('lr_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train score = 0.5799405646359584
Test score = 0.5594353640416048
--------
[[1547  349   81]
 [ 863 1065  295]
 [ 890  487 1153]]
Best params = {'C': 2, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear', 'warm_start': True}


In [76]:
# Grid-search the L2-penalised logistic regression and report its train/test metrics.
lr_models('lr_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Train score = 0.5790490341753344
Test score = 0.5592867756315008
--------
[[1543  349   85]
 [ 857 1062  304]
 [ 884  487 1159]]
Best params = {'C': 1.5, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'warm_start': True}


In [78]:
def dt_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a decision tree variant and print its evaluation.

    Parameters
    ----------
    model : str
        'dt_1' uses the gini split criterion; 'dt_2' uses entropy. The rest
        of the search grid is identical for both.
    X_train, X_test : objects exposing ``.values`` (DataFrames in this notebook)
        Feature matrices; the underlying arrays are handed to scikit-learn.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` / ``X_test``.

    Raises
    ------
    ValueError
        If ``model`` is not 'dt_1' or 'dt_2'. Checked before any fitting so
        a typo fails immediately with a clear message.

    Prints the train and test scores of the refit best estimator, the test
    confusion matrix and the winning parameter combination. Returns None.
    """
    criteria = {'dt_1': 'gini', 'dt_2': 'entropy'}
    if model not in criteria:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(criteria)}.")

    dt_params = {
        'criterion': [criteria[model]],
        'max_depth': [4, 24, 54],
        'min_samples_split': [5, 7, 11, 14],
        # 'sqrt' replaces the former 'auto' (same behaviour for classifiers);
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where
        # every candidate using it would fail to fit.
        'max_features': [None, 'log2', 'sqrt', .40, .50, .70],
        'random_state': [42]}

    search = GridSearchCV(DecisionTreeClassifier(),
                          dt_params,
                          cv=5,
                          verbose=1,
                          n_jobs=-1)
    search.fit(X_train.values, y_train)

    print(f'Train score = {search.score(X_train.values, y_train)}')
    print(f'Test score = {search.score(X_test.values, y_test)}')

    predictions = search.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {search.best_params_}')

In [79]:
# Grid-search the gini decision tree and report its train/test metrics.
dt_models('dt_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train score = 0.6292719167904903
Test score = 0.5197622585438336
--------
[[1415  321  241]
 [ 886  909  428]
 [ 832  524 1174]]
Best params = {'criterion': 'gini', 'max_depth': 24, 'max_features': None, 'min_samples_split': 14, 'random_state': 42}


In [80]:
# Grid-search the entropy decision tree and report its train/test metrics.
dt_models('dt_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train score = 0.6230312035661218
Test score = 0.512927191679049
--------
[[1558  302  117]
 [1006  820  397]
 [ 996  460 1074]]
Best params = {'criterion': 'entropy', 'max_depth': 24, 'max_features': 0.4, 'min_samples_split': 11, 'random_state': 42}


In [81]:
def rf_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a random forest variant and print its evaluation.

    Parameters
    ----------
    model : str
        'rf_1' uses the gini split criterion; 'rf_2' uses entropy. The rest
        of the search grid is identical for both.
    X_train, X_test : objects exposing ``.values`` (DataFrames in this notebook)
        Feature matrices; the underlying arrays are handed to scikit-learn.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` / ``X_test``.

    Raises
    ------
    ValueError
        If ``model`` is not 'rf_1' or 'rf_2'. Checked before any fitting so
        a typo fails immediately instead of after setting up a long search.

    Prints the train and test scores of the refit best estimator, the test
    confusion matrix and the winning parameter combination. Returns None.
    """
    criteria = {'rf_1': 'gini', 'rf_2': 'entropy'}
    if model not in criteria:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(criteria)}.")

    rf_params = {
        'n_estimators': [15, 24, 30],
        'criterion': [criteria[model]],
        'max_depth': [None, 5, 13, 21],
        'bootstrap': [True, False],
        'min_samples_split': [5, 7, 15, 25],
        # 'sqrt' replaces the former 'auto' (same behaviour for classifiers);
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where
        # every candidate using it would fail to fit.
        'max_features': [None, 'log2', 'sqrt', .10, .25, .50],
        'warm_start': [True],
        'random_state': [42]}

    search = GridSearchCV(RandomForestClassifier(),
                          rf_params,
                          cv=5,
                          verbose=1,
                          n_jobs=-1)
    search.fit(X_train.values, y_train)

    print(f'Train score = {search.score(X_train.values, y_train)}')
    print(f'Test score = {search.score(X_test.values, y_test)}')

    predictions = search.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {search.best_params_}')

In [82]:
# Grid-search the gini random forest and report its train/test metrics.
rf_models('rf_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Train score = 0.6698365527488855
Test score = 0.5533432392273403
--------
[[1520  355  102]
 [ 862 1061  300]
 [ 870  517 1143]]
Best params = {'bootstrap': True, 'criterion': 'gini', 'max_depth': 21, 'max_features': 0.1, 'min_samples_split': 5, 'n_estimators': 30, 'random_state': 42, 'warm_start': True}


In [83]:
# Grid-search the entropy random forest and report its train/test metrics.
rf_models('rf_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Train score = 0.6239227340267459
Test score = 0.5514115898959882
--------
[[1531  366   80]
 [ 866 1081  276]
 [ 892  539 1099]]
Best params = {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 21, 'max_features': 'auto', 'min_samples_split': 25, 'n_estimators': 30, 'random_state': 42, 'warm_start': True}
