In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import (RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [34]:
def preprocess_data(train, test, vectorizer, text_col='text', label_col='label'):
    """Turn the text column of two data sets into feature matrices.

    The vectorizer is fitted on the training text only (``fit_transform``)
    and then applied unchanged to the test text (``transform``), so both
    matrices share the vocabulary learned from ``train``.

    Parameters
    ----------
    train, test : indexable by column name (e.g. ``pandas.DataFrame``)
        Must support ``obj[text_col]`` and ``obj[label_col]``.
    vectorizer : object
        Any object exposing ``fit_transform(docs)`` and ``transform(docs)``
        whose results have a ``.shape`` attribute.
    text_col : str, default ``'text'``
        Name of the column holding the raw documents.
    label_col : str, default ``'label'``
        Name of the column holding the target labels.

    Returns
    -------
    tuple
        ``(X_tr, y_tr, X_ts, y_ts)``: train features, train labels,
        test features, test labels.
    """
    # Fit on the training documents only, then reuse that fit for the test set.
    X_tr = vectorizer.fit_transform(train[text_col])
    X_ts = vectorizer.transform(test[text_col])

    print("Shape of train matrix : ", X_tr.shape)
    print("Shape of test matrix : ", X_ts.shape)

    return X_tr, train[label_col], X_ts, test[label_col]

In [35]:
def train_models(X_tr, y_tr, X_ts, y_ts, models):
    """Cross-validate, fit and evaluate each model, printing a short report.

    For every estimator in ``models`` this:

    1. runs 5-fold ``cross_val_score`` on the training data and prints the
       mean and standard deviation of the fold scores;
    2. fits the estimator on the full training data (the estimator object
       passed in is fitted in place);
    3. prints ``accuracy_score`` of its predictions on the evaluation data;
    4. prints the output of ``predict_proba`` on the evaluation data when the
       estimator provides that method, or a short notice otherwise.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_ts, y_ts : evaluation features and labels.
    models : iterable of estimators exposing ``fit`` and ``predict``.

    Returns
    -------
    None
        Results are reported on stdout only.
    """
    for model in models:
        model_name = model.__class__.__name__

        score = cross_val_score(model, X_tr, y_tr, cv=5)
        msg = ("{0}:\n\tMean accuracy on development set\t= {1:.3f} "
               "(+/- {2:.3f})".format(model_name, score.mean(), score.std()))
        print(msg)

        model.fit(X_tr, y_tr)
        pred_eval = model.predict(X_ts)
        acc_eval = accuracy_score(y_ts, pred_eval)
        print("\tAccuracy on evaluation set\t\t= {0:.3f}".format(acc_eval))

        print("Probability: ")
        # Not every estimator exposes predict_proba; without this guard such a
        # model would raise AttributeError only after the (expensive)
        # cross-validation and fit above had already completed.
        if hasattr(model, "predict_proba"):
            print(model.predict_proba(X_ts))
        else:
            print("\t(not available: {0} has no predict_proba)".format(model_name))


In [36]:
# Load the full data set from disk.
df = pd.read_csv(
    'preprocessing/data.csv',
    encoding='latin',
)

# test_size=0.995 sends 99.5% of the rows to `test`, leaving 0.5% in `train`;
# the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.995,
    random_state=1,
)

In [37]:
# Ensemble classifiers to compare; each gets the same fixed seed.
ensamble_models = [
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1),
    AdaBoostClassifier(random_state=1),
]

In [38]:
# Experiment 1: features built with CountVectorizer.
countVectorizer = CountVectorizer()
X_tr, y_tr, X_ts, y_ts = preprocess_data(
    train=train,
    test=test,
    vectorizer=countVectorizer,
)

train_models(
    X_tr=X_tr,
    y_tr=y_tr,
    X_ts=X_ts,
    y_ts=y_ts,
    models=ensamble_models,
)

Shape of train matrix :  (224, 10670)
Shape of test matrix :  (44674, 10670)
RandomForestClassifier:
	Mean accuracy on development set	= 0.933 (+/- 0.040)
	Accuracy on evaluation set		= 0.972
Probability: 
[[0.28 0.72]
 [0.77 0.23]
 [0.3  0.7 ]
 ...
 [0.82 0.18]
 [0.27 0.73]
 [0.15 0.85]]
GradientBoostingClassifier:
	Mean accuracy on development set	= 0.996 (+/- 0.009)
	Accuracy on evaluation set		= 0.992
Probability: 
[[2.09721338e-05 9.99979028e-01]
 [9.99977176e-01 2.28243609e-05]
 [2.09721338e-05 9.99979028e-01]
 ...
 [9.99977176e-01 2.28243609e-05]
 [2.09721338e-05 9.99979028e-01]
 [2.09721338e-05 9.99979028e-01]]
AdaBoostClassifier:
	Mean accuracy on development set	= 0.996 (+/- 0.009)
	Accuracy on evaluation set		= 0.992
Probability: 
[[3.53049587e-07 9.99999647e-01]
 [9.99993409e-01 6.59111869e-06]
 [6.85570259e-06 9.99993144e-01]
 ...
 [9.99993409e-01 6.59111869e-06]
 [1.45952926e-07 9.99999854e-01]
 [9.23669317e-07 9.99999076e-01]]


In [39]:
# Experiment 2: same pipeline, features built with TfidfVectorizer.
tfidfVectorizer = TfidfVectorizer()
X_tr, y_tr, X_ts, y_ts = preprocess_data(
    train=train,
    test=test,
    vectorizer=tfidfVectorizer,
)

train_models(
    X_tr=X_tr,
    y_tr=y_tr,
    X_ts=X_ts,
    y_ts=y_ts,
    models=ensamble_models,
)

Shape of train matrix :  (224, 10670)
Shape of test matrix :  (44674, 10670)
RandomForestClassifier:
	Mean accuracy on development set	= 0.969 (+/- 0.023)
	Accuracy on evaluation set		= 0.968
Probability: 
[[0.24 0.76]
 [0.66 0.34]
 [0.37 0.63]
 ...
 [0.83 0.17]
 [0.36 0.64]
 [0.16 0.84]]
GradientBoostingClassifier:
	Mean accuracy on development set	= 0.996 (+/- 0.009)
	Accuracy on evaluation set		= 0.992
Probability: 
[[2.09721338e-05 9.99979028e-01]
 [9.99977176e-01 2.28243609e-05]
 [2.09721338e-05 9.99979028e-01]
 ...
 [9.99977176e-01 2.28243609e-05]
 [2.09721338e-05 9.99979028e-01]
 [2.09721338e-05 9.99979028e-01]]
AdaBoostClassifier:
	Mean accuracy on development set	= 0.996 (+/- 0.009)
	Accuracy on evaluation set		= 0.992
Probability: 
[[1.34170697e-07 9.99999866e-01]
 [9.99993466e-01 6.53406908e-06]
 [6.97380998e-06 9.99993026e-01]
 ...
 [9.99993466e-01 6.53406908e-06]
 [1.01829793e-06 9.99998982e-01]
 [3.58806957e-07 9.99999641e-01]]
