In [1]:
import sklearn
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
import os

Load and Split the Data

In [2]:
def unPickleData(filename):
  """Load and return the object pickled in the file at `filename`.

  The file is opened in binary mode and is closed again before the
  function returns. Only use this on trusted files: unpickling can
  execute arbitrary code.
  """
  with open(filename, 'rb') as handle:
    return pickle.load(handle)

def getDataPath(dirname,filename):
  """Return the absolute path <base>/processed_data/<dirname>/<filename>.

  <base> is the directory two levels above the current working directory,
  evaluated at call time. Nothing is checked on disk.
  """
  two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
  base_dir = os.path.abspath(two_levels_up)
  return os.path.join(base_dir, "processed_data", dirname, filename)

# Resolve the two pickle files under processed_data/posts, then load them.
# x_data / y_data are indexed in parallel further down, so they are
# presumably same-length samples and labels (not verified here).
x_path = getDataPath("posts", "X_posts.pkl")
y_path = getDataPath("posts", "Y_posts.pkl")
x_data = unPickleData(x_path)
y_data = unPickleData(y_path)

In [3]:
# Build the working set: keep at most 45000 samples labelled 0 (the first
# ones encountered, in file order) and every sample with any other label.
X, y = [], []
Class0_max_count = 45000  # remaining budget of label-0 samples; counts down
for i in range(len(x_data)):
    label = y_data[i]
    if label == 0:
        if Class0_max_count <= 0:
            # Budget exhausted: drop this label-0 sample.
            continue
        Class0_max_count -= 1
    X.append(x_data[i])
    y.append(label)

# Report how many samples of each class were kept.
np_y = np.array(y)
print("class 0:", np.count_nonzero(np_y == 0))
print("class 1:", np.count_nonzero(np_y == 1))

class 0: 45000
class 1: 9114


In [4]:
from sklearn.model_selection import train_test_split

# 60% train, then the remaining 40% is halved into validation and test
# (20% / 20% of the full set). The classes are imbalanced (the cell above
# reports roughly 5:1), so both splits are stratified on the labels to keep
# the same class ratio in every subset. random_state pins the shuffle so the
# split is reproducible.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=42, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=42, stratify=y_rem
)

Random Forests Training

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Base random-forest estimator. The model-selection cell passes `classifier`
# to GridSearchCV and tunes `n_estimators`, so it must be a forest (a single
# decision tree has no such parameter). random_state makes the bootstrap
# sampling and feature selection reproducible.
classifier = RandomForestClassifier(random_state=42)

# Fit once with default hyper-parameters as an untuned baseline. fit()
# returns the estimator itself, so `clf` and `classifier` are the same object.
clf = classifier.fit(X_train, y_train)

Model Selection

In [4]:
# Candidate values for the number of trees in the forest.
param_grid = dict(n_estimators=[200, 500])

# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split. `classifier` is the base estimator defined in an earlier
# cell; fit() returns the fitted search object.
CV_rfc = GridSearchCV(classifier, param_grid, cv=5).fit(X_train, y_train)

# Estimator refitted on the whole training split with the best parameters.
bestClassifier = CV_rfc.best_estimator_

print(CV_rfc.best_params_)

# best_score_ is the mean cross-validated score of the best candidate.
print("Accuracy :", CV_rfc.best_score_)

Testing

In [None]:
# Import the metric functions explicitly instead of relying on
# `sklearn.metrics` having been loaded as a side effect of another import.
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Evaluate the tuned model on the held-out test split.
y_pred = bestClassifier.predict(X_test)

print(classification_report(y_test, y_pred))
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
# Micro-averaged F1 on single-label predictions is numerically identical to
# accuracy, so it would only repeat the line above. Report the F1 of the
# positive class instead (pos_label defaults to 1; assumes labels are 0/1,
# as the class counts printed earlier suggest).
print(f"f-measure = {f1_score(y_test, y_pred, average='binary')}")

Saving the Model

In [None]:
def getSavedModelPath(dirname,filename):
  """Return the absolute path <base>/savedModels/<dirname>/<filename>.

  <base> is the directory two levels above the current working directory,
  evaluated at call time. Nothing is created or checked on disk.
  """
  two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
  base_dir = os.path.abspath(two_levels_up)
  return os.path.join(base_dir, "savedModels", dirname, filename)

# Persist the tuned model. Create the target directory first so the save
# does not fail with FileNotFoundError when savedModels/posts is missing
# (e.g. on a fresh checkout).
model_path = getSavedModelPath("posts","RandomForests.pkl")
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(bestClassifier,f)

Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Cross-validated accuracy of the tuned model at 50 training-set sizes
# (1% to 100% of the training split), 10 folds each. That is 500 fits on a
# single job, so this cell is slow.
train_size, train_score, test_scores = learning_curve(bestClassifier, X_train, y_train, cv = 10, scoring='accuracy', n_jobs=1, train_sizes=np.linspace(0.01, 1, 50), verbose=1)

# Average over the folds (axis 1) to get one value per training size.
mean_train = np.mean(train_score , axis = 1)
mean_test = np.mean(test_scores , axis = 1)

# The curves show 1 - accuracy, i.e. the error rate, so the legend and the
# y-axis are labelled as error rather than score.
plt.plot(train_size, 1-mean_train, label='Training error')
plt.plot(train_size, 1-mean_test, label='Cross-validation error')
plt.xlabel('Training Size')
plt.ylabel('Error (1 - accuracy)')
plt.title('Learning Curve')
plt.legend()
plt.show()