## Per-type classifier training and benchmarking (IE / NS / TF / JP)

### 1 Import packages and load preprocessed dataframe

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import pickle as pk
import time
from tqdm import tqdm
from utilities import clean_posts, postVectorizer
from RF import RandomForest, cross_validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier,plot_importance

# Short method code -> human-readable classifier name used in log output.
method_dict = dict(
    RF='RandomForest',
    SVM='SVM',
    XGB='XGBoost',
    DL='DeepLearning',
)

In [2]:
def _load_pickle(path):
    """Read and return the single object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    with open(path, 'rb') as handle:
        return pk.load(handle)


# Feature frame first, then the frame that holds the label columns.
tfidf_df = _load_pickle('./tfidf_df.pk')
df = _load_pickle('./df.pk')
    
# with open('./tfidf_df_IE.pk', 'rb') as pkl:
#     tfidf_df_IE = pk.load(pkl)

# with open('./tfidf_df_NS.pk', 'rb') as pkl:
#     tfidf_df_NS = pk.load(pkl)

# with open('./tfidf_df_TF.pk', 'rb') as pkl:
#     tfidf_df_TF = pk.load(pkl)

# with open('./tfidf_df_JP.pk', 'rb') as pkl:
#     tfidf_df_JP = pk.load(pkl)

### 2 Training and Benchmarking

In [3]:
def _predict_rows(clf, X):
    """Predict `X` one row at a time and return the labels as a flat 1-D array.

    Rows are fed individually (each reshaped to ``(1, n_features)``) with a
    tqdm progress bar. ``np.ravel`` flattens the result whether ``predict``
    returns a scalar or a 1-element array per row, so the metrics always
    receive a plain 1-D vector.
    """
    return np.ravel([clf.predict(row.reshape(1, -1)) for row in tqdm(X)])


def train_by_type(_type, method='RF', benchmark=False):
    """Train a binary classifier for one type column and optionally score it.

    Reads the module-level ``tfidf_df`` (features) and ``df`` (labels).

    Parameters
    ----------
    _type : str
        Name of the label column in ``df`` to classify; one of
        'IE', 'NS', 'TF', 'JP'.
    method : str
        Classifier to train: 'RF' (project RandomForest), 'SVM'
        (sklearn SVC) or 'XGB' (XGBClassifier).
    benchmark : bool
        When True, print accuracy and F1 on the training and testing splits.

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported codes.
    """
    y = df[_type].values.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_df.values, y, test_size=0.33, train_size=0.67,
        random_state=13, shuffle=True, stratify=y)

    # sklearn / xgboost estimators and the metrics expect 1-D label vectors.
    labels_train = y_train.ravel()
    labels_test = y_test.ravel()

    print("# @Training START #")
    if method == 'RF':
        # The project RandomForest takes a single array whose last column is the label.
        clf = RandomForest(n_estimators=100, verbose=0, min_leaf_size=3)
        clf.fit(np.concatenate((X_train, y_train), axis=1))

    elif method == 'SVM':
        clf = SVC(gamma='auto', probability=True)
        clf.fit(X_train, labels_train)

    elif method == 'XGB':
        # Early stopping must not look at the test split, otherwise the test
        # scores reported below are optimistically biased. Carve a validation
        # split out of the training data instead.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, labels_train, test_size=0.1,
            random_state=13, shuffle=True, stratify=labels_train)
        clf = XGBClassifier()
        clf.fit(X_fit, y_fit,
                early_stopping_rounds=10,
                eval_metric="logloss",
                eval_set=[(X_val, y_val)],
                verbose=False)
    else:
        raise ValueError("Invalid Method.")

    print("# @Training END #\n")

    # NOTE(review): the short sleeps presumably keep tqdm's progress bar from
    # interleaving with the surrounding prints - confirm before removing.
    time.sleep(0.5)

    if benchmark:
        # Fraction of samples whose label is 1 (assumes numeric 0/1 labels).
        positive_ratio = float(np.mean(y))

        print("# @Scoring START # --- {:s}".format(method_dict[method]))
        time.sleep(0.5)
        print("Type: {}: {} : {} = {} : {}".format(
            _type, _type[0], _type[1], positive_ratio, 1 - positive_ratio))
        time.sleep(0.5)

        # For 'XGB' the training split scored here also contains the rows
        # held out for early stopping.
        pred_train = _predict_rows(clf, X_train)
        print("Accuracy on training set - %s" % _type, accuracy_score(labels_train, pred_train))
        print("F1 Score on training set - %s" % _type, f1_score(labels_train, pred_train))
        time.sleep(0.5)

        pred_test = _predict_rows(clf, X_test)
        print("Accuracy on testing set - %s" % _type, accuracy_score(labels_test, pred_test))
        print("F1 Score on testing set - %s" % _type, f1_score(labels_test, pred_test))
        print("# @Scoring END #\n")

    return clf

In [4]:
# One RandomForest per type axis, each benchmarked right after training.
rf_benchmark = dict(method='RF', benchmark=True)

clf_ie = train_by_type('IE', **rf_benchmark)
clf_ns = train_by_type('NS', **rf_benchmark)
clf_tf = train_by_type('TF', **rf_benchmark)
clf_jp = train_by_type('JP', **rf_benchmark)

# @Training START #
# @Training END #

# @Scoring START # --- RandomForest
Type: IE: I : E = [0.23043228]


100%|██████████| 5812/5812 [00:27<00:00, 208.47it/s]


Accuracy on training set - IE 0.9970750172057812
F1 Score on training set - IE 0.9936114242765878


100%|██████████| 2863/2863 [00:13<00:00, 209.52it/s]
  'precision', 'predicted', average, warn_for)


Accuracy on testing set - IE 0.7694725812085226
F1 Score on testing set - IE 0.0
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- RandomForest
Type: NS: N : S = [0.13798271]


100%|██████████| 5812/5812 [00:28<00:00, 202.73it/s]


Accuracy on training set - NS 0.9805574673090158
F1 Score on training set - NS 0.9242119382964453


100%|██████████| 2863/2863 [00:13<00:00, 209.13it/s]
  'precision', 'predicted', average, warn_for)


Accuracy on testing set - NS 0.8620328326929794
F1 Score on testing set - NS 0.0
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- RandomForest
Type: TF: T : F = [0.4589049]


100%|██████████| 5812/5812 [00:27<00:00, 208.49it/s]


Accuracy on training set - TF 0.9998279421885754
F1 Score on training set - TF 0.9998125585754452


100%|██████████| 2863/2863 [00:13<00:00, 209.09it/s]


Accuracy on testing set - TF 0.7027593433461404
F1 Score on testing set - TF 0.625604927408711
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- RandomForest
Type: JP: J : P = [0.39585014]


100%|██████████| 5812/5812 [00:27<00:00, 208.71it/s]


Accuracy on training set - JP 0.9994838265657261
F1 Score on training set - JP 0.9993476842791911


100%|██████████| 2863/2863 [00:13<00:00, 208.81it/s]

Accuracy on testing set - JP 0.6234718826405868
F1 Score on testing set - JP 0.1820940819423369
# @Scoring END #






In [80]:
# Load the Kaggle forum messages, drop the metadata columns that are not
# needed, and discard rows with any missing value.
kaggle_posts = "./testdata/ForumMessages.csv"
kdf = (
    pd.read_csv(kaggle_posts)
    .drop(columns=["ForumTopicId", "Id", "PostDate", "ReplyToForumMessageId",
                   "Medal", "MedalAwardDate"])
    .dropna()
)

# Keep only messages of at least this many characters. A boolean mask selects
# the same rows as marking short messages NaN and dropping them, and avoids
# the `np.NaN` alias, which NumPy 2.0 removed.
MIN_MESSAGE_LENGTH = 200
kdf = kdf[kdf.Message.apply(len) >= MIN_MESSAGE_LENGTH]

In [94]:
# Spot-check the cleaning routine on one forum message
# (positional row 32932, second remaining column).
sample_message = kdf.values[32932, 1]
clean_posts(sample_message)

'pi followed your link and i didnt findnbspa href which can take in sparse matrices and is probably best suited to large datasets like ours it works just like any other classifierp pcodeclf sgdclassifier clffitx y etcetccodenbspp pas for your error what is typextrain and typeytrain QST i think sklearn only supports csrmode sparse matricesp'