In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

In [68]:
# Load the pre-cleaned comments table from the working directory.
df = pd.read_csv(filepath_or_buffer="clean.csv")

In [69]:
# Preview the first rows exactly as read from the CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,body,score,target
0,3,"Hiethcliff, Garfield..?",1,0
1,6,That looks like a Garfield,1,0
2,7,Tell it to pick up some weights or gloves… no ...,1,0
3,8,Please put him on a diet now or he will die wa...,1,0
4,10,I bet this dad never agreed to have a dog in t...,1,0


In [70]:
# Remove the leftover "Unnamed: ..." column (presumably an index saved by an
# earlier export - confirm against how clean.csv was written).
# Selecting by name rather than by position, and reassigning instead of
# mutating in place, makes this cell safe to re-run: a second execution no
# longer drops `body` once the unnamed column is already gone.
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

In [71]:
# Preview again to confirm the first column was removed.
df.head()

Unnamed: 0,body,score,target
0,"Hiethcliff, Garfield..?",1,0
1,That looks like a Garfield,1,0
2,Tell it to pick up some weights or gloves… no ...,1,0
3,Please put him on a diet now or he will die wa...,1,0
4,I bet this dad never agreed to have a dog in t...,1,0


In [72]:
# Count missing values per column before building features.
df.isnull().sum()

body      0
score     0
target    0
dtype: int64

In [73]:
# Predictors are the comment text plus its score; `target` is the label.
features = ['body', 'score']
X = df.loc[:, features]
y = df.loc[:, 'target']

In [74]:
# Re-encode the labels as consecutive integers (returns a NumPy array).
y = LabelEncoder().fit(y).transform(y)

In [75]:
# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [76]:
# Check the row/column counts of the two partitions.
X_train.shape, X_test.shape

((10095, 2), (3365, 2))

In [77]:
# Confirm the held-out partition has no missing values.
X_test.isnull().sum()

body     0
score    0
dtype: int64

In [78]:
# Count 1- to 6-word n-grams in the comment text, ignoring English stop words
# and folding accents to ASCII. min_df=.03 keeps only terms that occur in at
# least 3% of the training documents.
cv_body = CountVectorizer(
    ngram_range=(1, 6),
    min_df=.03,
    stop_words='english',
    strip_accents='ascii',
)

# Learn the vocabulary from the training text only, then reuse it for the
# test text so both matrices share the same columns.
X_train_body = cv_body.fit_transform(X_train['body'])
X_test_body = cv_body.transform(X_test['body'])

In [79]:
# Turn the sparse count matrices into labelled DataFrames. Each vocabulary term
# gets a '_body' suffix to mark which text field it came from.
# `.toarray()` yields a plain ndarray; `.todense()` would return the
# discouraged `np.matrix` type. The test frame reuses the train frame's column
# labels so the two cannot drift apart.
X_train_body_df = pd.DataFrame(
    X_train_body.toarray(),
    columns=[term + '_body' for term in cv_body.get_feature_names_out()],
)
X_test_body_df = pd.DataFrame(X_test_body.toarray(), columns=X_train_body_df.columns)

In [80]:
# Both frames should have the same number of vocabulary columns.
X_train_body_df.shape, X_test_body_df.shape

((10095, 142), (3365, 142))

In [81]:
# Preview the n-gram count columns for the test partition.
X_test_body_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,try_body,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [82]:
# Count missing values per n-gram column in the test partition.
X_test_body_df.isnull().sum()

action_body                                               0
action performed_body                                     0
action performed automatically_body                       0
action performed automatically contact_body               0
action performed automatically contact moderators_body    0
                                                         ..
work_body                                                 0
www_body                                                  0
www reddit_body                                           0
www reddit com_body                                       0
years_body                                                0
Length: 142, dtype: int64

In [83]:
# Length of the score column, to compare with the row count of the n-gram frame.
X_train.score.shape

(10095,)

In [84]:
# Append the numeric `score` column to the n-gram counts. X_train/X_test still
# carry the row labels from the original frame after the split, so both sides
# are re-indexed 0..n-1 first and rows are matched by position.
vector_train_df = (
    X_train_body_df
    .reset_index(drop=True)
    .join(X_train['score'].reset_index(drop=True))
)
vector_test_df = (
    X_test_body_df
    .reset_index(drop=True)
    .join(X_test['score'].reset_index(drop=True))
)

In [85]:
# Each combined frame should have one more column than its n-gram frame.
vector_train_df.shape, vector_test_df.shape

((10095, 143), (3365, 143))

In [86]:
# Total missing values in each combined frame; a non-zero count would point to
# misaligned indexes in the concatenation.
vector_train_df.isnull().sum().sum(), vector_test_df.isnull().sum().sum()

(0, 0)

In [87]:
# Preview the final training feature matrix (n-gram counts plus score).
vector_train_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body,score
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,12
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [88]:
# Preview the final test feature matrix (n-gram counts plus score).
vector_test_df.head()

Unnamed: 0,action_body,action performed_body,action performed automatically_body,action performed automatically contact_body,action performed automatically contact moderators_body,action performed automatically contact moderators subreddit_body,advice_body,anxiety_body,approved_body,automatically_body,...,use_body,ve_body,want_body,way_body,work_body,www_body,www reddit_body,www reddit com_body,years_body,score
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [95]:
def lr_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a logistic-regression variant and print its evaluation.

    Parameters
    ----------
    model : str
        'lr_1' searches an L1 penalty with the saga solver; 'lr_2' searches an
        L2 penalty over the saga, sag, newton-cg and lbfgs solvers.
    X_train, X_test : pandas.DataFrame
        Feature frames; their `.values` arrays are passed to the estimator.
    y_train, y_test : array-like
        Class labels for the corresponding feature frames.

    Returns
    -------
    None
        Results are printed: train score, test score, the test confusion
        matrix and the best parameter combination.

    Raises
    ------
    ValueError
        If `model` is not one of the supported variant names. The check runs
        before any fitting, replacing the earlier printed message followed by
        an UnboundLocalError.
    """
    # Settings shared by both variants.
    shared_grid = {
        'multi_class': ['multinomial'],
        'C': [1, 1.5, 2, 2.5],
        'class_weight': ['balanced'],
        'warm_start': [True, False],
        'random_state': [42],
    }

    # What differs between the variants: the penalty and the solvers tried.
    variant_grid = {
        'lr_1': {'penalty': ['l1'], 'solver': ['saga']},
        'lr_2': {'penalty': ['l2'], 'solver': ['saga', 'sag', 'newton-cg', 'lbfgs']},
    }

    if model not in variant_grid:
        raise ValueError(
            f'Unknown model {model!r}; expected one of {sorted(variant_grid)}.'
        )

    M = GridSearchCV(LogisticRegression(),
                     {**shared_grid, **variant_grid[model]},
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

    M.fit(X_train.values, y_train)

    print(f'Train score = {M.score(X_train.values, y_train)}')
    print(f'Test score = {M.score(X_test.values, y_test)}')

    predictions = M.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [96]:
# Run the L1-penalised logistic-regression search on the combined features.
lr_models('lr_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train score = 0.4881624566617137
Test score = 0.46924219910846954
--------
[[611 291  86]
 [449 610  53]
 [486 421 358]]
Best params = {'C': 1, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l1', 'random_state': 42, 'solver': 'saga', 'warm_start': True}




In [97]:
# Run the L2-penalised logistic-regression search on the combined features.
lr_models('lr_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Train score = 0.5682020802377414
Test score = 0.562258543833581
--------
[[808 142  38]
 [472 528 112]
 [480 229 556]]
Best params = {'C': 1, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l2', 'random_state': 42, 'solver': 'newton-cg', 'warm_start': True}


In [98]:
def dt_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a decision-tree variant and print its evaluation.

    Parameters
    ----------
    model : str
        'dt_1' uses the gini split criterion; 'dt_2' uses entropy. The rest of
        the search grid is the same for both.
    X_train, X_test : pandas.DataFrame
        Feature frames; their `.values` arrays are passed to the estimator.
    y_train, y_test : array-like
        Class labels for the corresponding feature frames.

    Returns
    -------
    None
        Results are printed: train score, test score, the test confusion
        matrix and the best parameter combination.

    Raises
    ------
    ValueError
        If `model` is not one of the supported variant names. The check runs
        before any fitting, replacing the earlier printed message followed by
        an UnboundLocalError.
    """
    criterion_by_model = {'dt_1': 'gini', 'dt_2': 'entropy'}

    if model not in criterion_by_model:
        raise ValueError(
            f'Unknown model {model!r}; expected one of {sorted(criterion_by_model)}.'
        )

    dt_params = {
        'criterion': [criterion_by_model[model]],
        'max_depth': [4, 24, 54],
        'min_samples_split': [5, 7, 11, 14],
        # 'sqrt' replaces the old 'auto' alias, which meant the same thing for
        # classifiers but was removed in scikit-learn 1.3; keeping 'auto' makes
        # every candidate that uses it fail to fit on current versions.
        'max_features': [None, 'log2', 'sqrt', .40, .50, .70],
        'random_state': [42]}

    M = GridSearchCV(DecisionTreeClassifier(),
                     dt_params,
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

    M.fit(X_train.values, y_train)

    print(f'Train score = {M.score(X_train.values, y_train)}')
    print(f'Test score = {M.score(X_test.values, y_test)}')

    predictions = M.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [99]:
# Run the gini decision-tree search on the combined features.
dt_models('dt_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train score = 0.6205052005943537
Test score = 0.5280832095096583
--------
[[538 158 292]
 [259 450 403]
 [222 254 789]]
Best params = {'criterion': 'gini', 'max_depth': 24, 'max_features': 0.5, 'min_samples_split': 11, 'random_state': 42}


In [100]:
# Run the entropy decision-tree search on the combined features.
dt_models('dt_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train score = 0.6154531946508173
Test score = 0.5263001485884101
--------
[[579 155 254]
 [299 431 382]
 [258 246 761]]
Best params = {'criterion': 'entropy', 'max_depth': 24, 'max_features': 0.4, 'min_samples_split': 11, 'random_state': 42}


In [101]:
def rf_models(model, X_train, X_test, y_train, y_test):
    """Grid-search a random-forest variant and print its evaluation.

    Parameters
    ----------
    model : str
        'rf_1' uses the gini split criterion; 'rf_2' uses entropy. The rest of
        the search grid is the same for both.
    X_train, X_test : pandas.DataFrame
        Feature frames; their `.values` arrays are passed to the estimator.
    y_train, y_test : array-like
        Class labels for the corresponding feature frames.

    Returns
    -------
    None
        Results are printed: train score, test score, the test confusion
        matrix and the best parameter combination.

    Raises
    ------
    ValueError
        If `model` is not one of the supported variant names. The check runs
        before any fitting, replacing the earlier printed message followed by
        an UnboundLocalError.
    """
    criterion_by_model = {'rf_1': 'gini', 'rf_2': 'entropy'}

    if model not in criterion_by_model:
        raise ValueError(
            f'Unknown model {model!r}; expected one of {sorted(criterion_by_model)}.'
        )

    rf_params = {
        'n_estimators': [15, 24, 30],
        'criterion': [criterion_by_model[model]],
        'max_depth': [None, 5, 13, 21],
        'bootstrap': [True, False],
        'min_samples_split': [5, 7, 15, 25],
        # 'sqrt' replaces the old 'auto' alias, which meant the same thing for
        # classifiers but was removed in scikit-learn 1.3; keeping 'auto' makes
        # every candidate that uses it fail to fit on current versions.
        'max_features': [None, 'log2', 'sqrt', .10, .25, .50],
        'warm_start': [True],
        'random_state': [42]}

    M = GridSearchCV(RandomForestClassifier(),
                     rf_params,
                     cv=5,
                     verbose=1,
                     n_jobs=-1)

    M.fit(X_train.values, y_train)

    print(f'Train score = {M.score(X_train.values, y_train)}')
    print(f'Test score = {M.score(X_test.values, y_test)}')

    predictions = M.predict(X_test.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [102]:
# Run the gini random-forest search on the combined features.
rf_models('rf_1', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Train score = 0.6596334819217434
Test score = 0.550074294205052
--------
[[717 186  85]
 [404 528 180]
 [406 253 606]]
Best params = {'bootstrap': False, 'criterion': 'gini', 'max_depth': 21, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 30, 'random_state': 42, 'warm_start': True}


In [103]:
# Run the entropy random-forest search on the combined features.
rf_models('rf_2', vector_train_df, vector_test_df, y_train, y_test)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Train score = 0.7267954432887568
Test score = 0.5560178306092125
--------
[[628 207 153]
 [316 557 239]
 [285 294 686]]
Best params = {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 25, 'n_estimators': 30, 'random_state': 42, 'warm_start': True}
