In [3]:
import pandas as pd
import warnings

from sklearn import decomposition

from sklearn.model_selection import GridSearchCV, cross_val_score

warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler


In [168]:
# Full cleaned dataset. It is later passed to decision_tree_grid_search with
# drop_text_data=True, so it is expected to carry the text columns as well.
data = pd.read_csv(filepath_or_buffer='../../data/cleaned_data/all_columns.csv')

#### First we are going to try to find the best hyperparameters for the decision tree classifier.

In [173]:
def decision_tree_grid_search(data, drop_text_data=False, random_state=None):
    """Grid-search a StandardScaler -> PCA -> DecisionTreeClassifier pipeline.

    The grid covers every PCA component count from 1 up to the number of
    feature columns, both split criteria ('gini' and 'entropy') and the tree
    depths 4, 6, 8 and 12. The winning settings are printed, followed by the
    scores of a 4-fold cross validation of the whole search.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column 'real_fake_grade'. Every other column
        that is kept is used as a feature. The frame is NOT modified.
    drop_text_data : bool, default False
        When True, the columns 'full_text', 'user_description' and 'username'
        are removed before fitting.
    random_state : int or None, default None
        Forwarded to DecisionTreeClassifier. Pass an integer to make the
        fitted trees (and therefore the printed results) repeatable between
        runs; None keeps the previous unseeded behaviour.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    KeyError
        If 'real_fake_grade' is missing, or if drop_text_data is True and one
        of the text columns is missing.
    """
    if drop_text_data:
        data = data.drop(['full_text', 'user_description', 'username'], axis=1)

    # Select the target instead of pop()-ing it: pop() removes the column from
    # the caller's frame in place, so a second call with the same DataFrame
    # (e.g. re-running the notebook cell) failed with a KeyError.
    y = data['real_fake_grade']
    X = data.drop(columns='real_fake_grade')

    pipe = Pipeline(steps=[('sc', StandardScaler()),
                           ('pca', decomposition.PCA()),
                           ('decisiontree', DecisionTreeClassifier(random_state=random_state))])

    # Every candidate value, keyed by '<step name>__<parameter>'.
    parameters = dict(pca__n_components=list(range(1, X.shape[1] + 1)),
                      decisiontree__criterion=['gini', 'entropy'],
                      decisiontree__max_depth=[4, 6, 8, 12])

    clf = GridSearchCV(pipe, parameters)

    # Fit the grid search (uses GridSearchCV's default cross validation).
    clf.fit(X, y)

    # Read the refitted pipeline's parameters once instead of rebuilding the
    # dictionary for every printed line.
    best = clf.best_estimator_.get_params()
    print('Best Criterion:', best['decisiontree__criterion'])
    print('Best max_depth:', best['decisiontree__max_depth'])
    print('Best Number Of Components:', best['pca__n_components'])
    print()
    print(best['decisiontree'])

    # Nested cross validation: the complete grid search is cloned and refitted
    # on each of the 4 training folds, so this step costs roughly four times
    # the fit above. The folds are not shuffled, so the row order of `data`
    # influences the individual scores.
    cv_result = cross_val_score(clf, X, y, cv=4, n_jobs=-1)
    print("Cross validation scores:\n")
    print(cv_result)

#### 1. Grid search on all the metadata

In [170]:
# Search over all metadata columns; the three free-text columns are dropped first.
decision_tree_grid_search(data=data, drop_text_data=True)

Best Criterion: entropy
Best max_depth: 8
Best Number Of Components: 31

DecisionTreeClassifier(criterion='entropy', max_depth=8)
Cross validation scores:

[0.68993424 0.70495951 0.60374494 0.29807692]


#### 2. tweet_data_columns

In [174]:
# Tweet-level columns only. 'Unnamed: 0' is removed so it is not used as a
# feature (presumably an index column written by an earlier to_csv - verify).
tweet_data = (
    pd.read_csv('../../data/cleaned_data/tweet_data_columns.csv')
    .drop(columns=['Unnamed: 0'])
)
decision_tree_grid_search(tweet_data)

Best Criterion: gini
Best max_depth: 8
Best Number Of Components: 15

DecisionTreeClassifier(max_depth=8)
Cross validation scores:

[0.58877086 0.61336032 0.57236842 0.52580972]


#### 3. User_data_columns

In [175]:
# User-level columns only. 'Unnamed: 0' is removed so it is not used as a
# feature (presumably an index column written by an earlier to_csv - verify).
user_data = (
    pd.read_csv('../../data/cleaned_data/user_data_columns.csv')
    .drop(columns=['Unnamed: 0'])
)
decision_tree_grid_search(user_data)

Best Criterion: entropy
Best max_depth: 6
Best Number Of Components: 30

DecisionTreeClassifier(criterion='entropy', max_depth=6)
Cross validation scores:

[0.74152757 0.77125506 0.62550607 0.36538462]


#### Predictions with new hyperparameters.

In [8]:
def decision_tree_specific_search(data, dt_criterion, dt_max_depth, dt_components,
                                  drop_text_data=False, random_state=None):
    """Fit the scaler -> PCA -> decision tree pipeline for given hyperparameters.

    The candidate values are handed to GridSearchCV, so each `dt_*` argument
    is a list of values to try; a single-element list pins that setting. After
    fitting, two accuracy figures are printed (see Notes).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column 'real_fake_grade'. Every other column
        that is kept is used as a feature. The frame is NOT modified.
    dt_criterion : list
        Candidate split criteria for the tree, e.g. ["entropy"].
    dt_max_depth : list
        Candidate maximum tree depths, e.g. [8].
    dt_components : list
        Candidate numbers of PCA components, e.g. [31].
    drop_text_data : bool, default False
        When True, the columns 'full_text', 'user_description' and 'username'
        are removed before fitting.
    random_state : int or None, default None
        Forwarded to DecisionTreeClassifier. Pass an integer for repeatable
        results; None keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The scores are printed.

    Raises
    ------
    KeyError
        If 'real_fake_grade' is missing, or if drop_text_data is True and one
        of the text columns is missing.

    Notes
    -----
    "Accuracy score was" is computed on the same rows the best pipeline was
    refitted on, so it is a training (in-sample) accuracy and tends to be
    optimistic. The second printed line reports the mean cross-validated score
    of the selected setting, which is the figure to use when judging how the
    model generalises.
    """
    if drop_text_data:
        data = data.drop(['full_text', 'user_description', 'username'], axis=1)

    # Select the target instead of pop()-ing it: pop() removes the column from
    # the caller's frame in place, so a second call with the same DataFrame
    # failed with a KeyError.
    y = data['real_fake_grade']
    X = data.drop(columns='real_fake_grade')

    pipe = Pipeline(steps=[('sc', StandardScaler()),
                           ('pca', decomposition.PCA()),
                           ('decisiontree', DecisionTreeClassifier(random_state=random_state))])

    parameters = dict(pca__n_components=dt_components,
                      decisiontree__criterion=dt_criterion,
                      decisiontree__max_depth=dt_max_depth)

    clf = GridSearchCV(pipe, parameters)
    clf.fit(X, y)

    # In-sample score of the refitted best pipeline (kept for continuity with
    # earlier runs of this notebook).
    print("Accuracy score was: {}".format(clf.score(X, y)))
    # Held-out estimate that the search already computed while fitting.
    print("Mean cross-validated accuracy was: {}".format(clf.best_score_))

### All the metadata

In [7]:
# Re-read the full dataset and evaluate the settings chosen by the grid search
# on all metadata (entropy, depth 8, 31 PCA components).
all_data = pd.read_csv(filepath_or_buffer='../../data/cleaned_data/all_columns.csv')

decision_tree_specific_search(
    all_data,
    dt_criterion=["entropy"],
    dt_max_depth=[8],
    dt_components=[31],
    drop_text_data=True,
)

Accuracy score was: 0.6894370651486401


### tweet data

In [215]:
# Re-read the tweet-level columns and evaluate the settings chosen by the grid
# search on this subset (gini, depth 8, 15 PCA components).
tweet_data = (
    pd.read_csv('../../data/cleaned_data/tweet_data_columns.csv')
    .drop(columns=['Unnamed: 0'])
)
decision_tree_specific_search(
    tweet_data,
    dt_criterion=["gini"],
    dt_max_depth=[8],
    dt_components=[15],
)

Accuracy score was: 0.7013282732447818


### all user data


In [216]:
# Re-read the user-level columns and evaluate the settings chosen by the grid
# search on this subset (entropy, depth 6, 30 PCA components).
user_data = (
    pd.read_csv('../../data/cleaned_data/user_data_columns.csv')
    .drop(columns=['Unnamed: 0'])
)
decision_tree_specific_search(
    user_data,
    dt_criterion=["entropy"],
    dt_max_depth=[6],
    dt_components=[30],
)

Accuracy score was: 0.6884250474383302
