In [32]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def _candidate_search_space(option):
    """Return the ``(estimator, param_grid)`` pair selected by ``option``.

    ``0`` selects k-nearest neighbours, ``1`` a decision tree, and any other
    value logistic regression. ``param_grid`` is either a dict or a list of
    dicts; both forms are accepted by ``GridSearchCV``.
    """
    if option == 0:
        return KNeighborsClassifier(), {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski'],
        }

    if option == 1:
        return DecisionTreeClassifier(), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 'auto' is deliberately not listed: scikit-learn removed it in 1.3
            # and, for classifiers, it was only ever an alias of 'sqrt'.
            'max_features': [None, 'sqrt', 'log2'],
        }

    # Logistic regression: not every solver supports every penalty, so the
    # grid is a list of sub-grids holding only valid combinations. A single
    # cross product would spend most of its fits on configurations that raise.
    shared = {
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [100, 200, 500, 1000],
        'class_weight': [None, 'balanced'],
    }
    return LogisticRegression(), [
        {**shared, 'penalty': ['l2'],
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
        {**shared, 'penalty': ['l1'],
         'solver': ['liblinear', 'saga']},
        # elasticnet is only implemented by saga and needs an explicit l1_ratio.
        {**shared, 'penalty': ['elasticnet'],
         'solver': ['saga'],
         'l1_ratio': [0.25, 0.5, 0.75]},
    ]


def _fit_and_report(X_train, X_val, y_train, y_val, option):
    """Tune the model chosen by ``option`` on the train split and print reports.

    The search itself is delegated to ``find_best_model`` (defined in a
    separate cell of this notebook). Classification reports are printed for
    both the training and the validation split, and the tuned estimator is
    returned.
    """
    estimator, param_grid = _candidate_search_space(option)
    model = find_best_model(estimator, param_grid, X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # Report
    print(f'Report of train set with {int(len(X_train))} examples: ')
    print(classification_report(y_train, y_pred_train))
    print('\nReport of validation set:')
    print(classification_report(y_val, y_pred_val))

    return model


def models(X, y, option = 0, val_size=0.2, random_state=0):
    """Tune and evaluate a classifier on an already-numeric feature matrix.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    option : 0 for KNN, 1 for a decision tree, anything else for logistic
        regression.
    val_size : fraction of the data held out for validation.
    random_state : seed for the train/validation split.

    Returns
    -------
    The best estimator found by the grid search.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=random_state)
    return _fit_and_report(X_train, X_val, y_train, y_val, option)


def cat_models(X, y, option = 0, val_size=0.2, random_state=0):
    """Tune and evaluate a classifier on raw text documents.

    The documents are vectorised with ``CountVectorizer`` followed by
    ``TfidfTransformer``. The vectoriser is fitted on the training split only
    and then applied to the validation split.

    Parameters
    ----------
    X, y : text documents and labels, passed to ``train_test_split``.
    option : 0 for KNN, 1 for a decision tree, anything else for logistic
        regression.
    val_size : fraction of the data held out for validation.
    random_state : seed for the train/validation split.

    Returns
    -------
    ``(model, pipeline)``: the best estimator found by the grid search and the
    fitted vectorisation pipeline needed to transform new text.
    """
    pipeline = Pipeline([('count',CountVectorizer()),('tfidf',TfidfTransformer())])

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=random_state)
    # Dense arrays are kept here to match what cat_predict feeds the model.
    X_train = pipeline.fit_transform(X_train).toarray()
    X_val = pipeline.transform(X_val).toarray()

    model = _fit_and_report(X_train, X_val, y_train, y_val, option)
    return model, pipeline

def predict(model, X, y, name_set='test set'):
    """Predict on ``X`` with a fitted model and print timing and metrics.

    Prints the wall-clock time of ``model.predict``, then precision, recall
    and F1 of the predictions against ``y``. If the model exposes
    ``predict_proba``, the ROC AUC is computed from the second probability
    column; otherwise an "N/A" line is printed instead.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X, y : features and true labels of the set being evaluated.
    name_set : label used in the timing message.

    Returns
    -------
    The predictions returned by ``model.predict(X)``.
    """
    began = time.time()
    predictions = model.predict(X)
    elapsed = time.time() - began

    print(f'Time to predict on the {name_set} with {len(X)} examples: {round(elapsed, 4)}s')

    labelled_scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-score:", f1_score),
    )
    for heading, scorer in labelled_scorers:
        print(heading, scorer(y, predictions))

    # Guard clause: without probability estimates there is nothing to rank.
    if not hasattr(model, "predict_proba"):
        print("ROC AUC Score: N/A (model doesn't support probability estimates)")
        return predictions

    positive_scores = model.predict_proba(X)[:, 1]
    print("ROC AUC Score:", roc_auc_score(y, positive_scores))
    return predictions

def cat_predict(model, pipeline, X, y, name_set='test set'):
    """Vectorise text with ``pipeline``, predict, and print a report.

    ``X`` is transformed by the (already fitted) pipeline and converted to a
    dense array before prediction. Only the ``model.predict`` call is timed;
    the transformation is not included in the reported duration.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    pipeline : fitted transformer whose ``transform`` output supports
        ``.toarray()``.
    X, y : raw inputs and true labels of the set being evaluated.
    name_set : label used in the timing message.

    Returns
    -------
    The predictions returned by ``model.predict``.
    """
    features = pipeline.transform(X).toarray()

    began = time.time()
    predictions = model.predict(features)
    elapsed = time.time() - began

    print(f'Time to predict on the {name_set} with {len(features)} examples: {round(elapsed, 4)}s')
    print(classification_report(y, predictions))

    return predictions

In [18]:
import pandas as pd
import numpy as np

In [24]:
# Read the two CSV files used by the experiments below.
# NOTE(review): going by the file names, `data` is presumably a down-sampled
# training set and `data_test` the held-out test set - confirm.
data = pd.read_csv('downsample_data.csv', encoding='utf8')
data_test = pd.read_csv('test.csv', encoding='utf8')

# Features are the `post_message_preproced` column, targets the `label` column.
X = data['post_message_preproced'].values
y = data['label'].values
X_test = data_test['post_message_preproced'].values
y_test = data_test['label'].values

# Sanity check: features and labels must have matching lengths.
print(X.shape, y.shape)
print(X_test.shape, y_test.shape)

(1073,) (1073,)
(793,) (793,)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import warnings

def find_best_model(model, param_grid, X_train, y_train, cv=5, scoring='accuracy'):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    Parameters
    ----------
    model : unfitted scikit-learn estimator.
    param_grid : dict, or list of dicts, of hyper-parameter candidates.
    X_train, y_train : data the cross-validated search is fitted on.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    scoring : metric used to rank candidates (default ``'accuracy'``).

    Returns
    -------
    ``grid_search.best_estimator_``. The best parameters and best
    cross-validated score are printed as a side effect.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scoring, verbose=1, n_jobs=-1)

    # Silence warnings only for the duration of the search. Calling
    # warnings.filterwarnings('ignore') at function level would permanently
    # mute every later warning in the kernel.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        grid_search.fit(X_train, y_train)

    best_param = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best param: ", best_param)
    print("Best score: ", best_score)
    return grid_search.best_estimator_

In [31]:
# Train and evaluate each classifier on the text features.
# `names` must follow the option order handled by cat_models:
# 0 -> KNN, 1 -> decision tree, anything else -> logistic regression.
options = [0,1,2]
names = ['KNN', 'DecisionTree', 'Logistic']

for (option, name) in zip(options, names):
    print(f'{name} model:')
    model, pipeline = cat_models(X, y, option=option)
    # The predictions are not needed here; cat_predict prints the report.
    _ = cat_predict(model=model, pipeline=pipeline, X=X_test, y=y_test)
    print('*'*130)

KNN model:
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best param:  {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
Best score:  0.7272065823473411
Report of train set with 858 examples: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       448
           1       1.00      1.00      1.00       410

    accuracy                           1.00       858
   macro avg       1.00      1.00      1.00       858
weighted avg       1.00      1.00      1.00       858


Report of validation set:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       114
           1       0.80      0.77      0.79       101

    accuracy                           0.80       215
   macro avg       0.80      0.80      0.80       215
weighted avg       0.80      0.80      0.80       215

Time to predict on the test set with 793 examples: 0.1994s
              precision    recall  f

In [33]:
# Rebind X / y to the columns listed in `cols`.
# `data` and `data_test` are the frames read in the data-loading cell.
cols = ['user_name_labelEncoder', 'num_like_post', 'num_comment_post', 'num_share_post']
X = data[cols].values
y = data['label'].values
X_test = data_test[cols].values
y_test = data_test['label'].values

print(X.shape, y.shape)
print(X_test.shape, y_test.shape)

# Option index -> display name, in the order handled by `models`.
options = [0,1,2]
names = ['KNN', 'DecisionTree', 'Logistic']

for option, name in zip(options, names):
    print(f'{name} model:')
    model = models(X, y, option=option)
    # Predictions are discarded; `predict` prints the metrics itself.
    _ = predict(model=model, X=X_test, y=y_test)
    print('*' * 100)

(1073, 4) (1073,)
(793, 4) (793,)
KNN model:
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best param:  {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Best score:  0.6165170678634571
Report of train set with 858 examples: 
              precision    recall  f1-score   support

           0       0.69      0.78      0.73       448
           1       0.72      0.62      0.66       410

    accuracy                           0.70       858
   macro avg       0.71      0.70      0.70       858
weighted avg       0.70      0.70      0.70       858


Report of validation set:
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       114
           1       0.60      0.52      0.56       101

    accuracy                           0.61       215
   macro avg       0.61      0.60      0.60       215
weighted avg       0.61      0.61      0.61       215

Time to predict on the test set with 793 examples: 0.0629s
Pre