In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import enchant
import datetime
from nltk.corpus import brown
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
d = enchant.Dict("en_US")
word_set = set(brown.words())


In [4]:
# SET UP DATA

# convert csv to a pandas DataFrame format
train_df = pd.DataFrame.from_csv('data/train.csv')
test_df = pd.DataFrame.from_csv('data/test.csv')

In [5]:
#check misspellings
def spell_check_sentence(sentence):
    misspelled = [not d.check(x) if x!= "" else False for x in sentence.split(" ")]
    return sum(misspelled)

In [22]:
def create_features(df):
    # CREATING FEATURES

    #if tweet starts with quote
    quotes = [t[0] == '"' for t in df['text']]
    df['in_quotes'] = quotes

    #it tweet contains @realdonaldtrump
    df['uses_own_handle'] = ["@realDonaldTrump" in t for t in df['text']]

    #if tweet contains http
    df['contains_http'] = ["http" in t for t in df['text']]

    #if tweet contains hashtag
    df['contains_hashtag'] = ["#" in t for t in df['text']]

    #check for emojis (U+)
    df['contains_emojis'] = ["U+" in t for t in df['text']]

    #check length
    df['length'] = [len(t) for t in df['text']]

    #check num of mispellings
    df['num_of_misspellings'] = [spell_check_sentence(s) for s in df['text']]
    
    #converting created to time of day (in seconds)
    times = [t.split(' ')[1].split(':') for t in df['created']]
    df['time_of_day_sec'] = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]
    
    df['contains_any_at'] = ['@' in s for s in df['text']]
    
    return df

In [7]:
features = create_features(train_df)
features = features.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'statusSource', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
labels = np.array(features['label'])
features = features.drop(['created', 'label', 'text', 'replyToSN'], axis = 1)
feature_list = list(features.columns)
features = np.array(features)

train_features, validate_features, train_labels, validate_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [10]:
#IMPLEMENTING RANDOM FOREST
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [12]:
rf_pred

array([ 1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,
        1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
        1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1])

In [13]:
rf_pred = rf.predict(validate_features)
sum(rf_pred == validate_labels)/float(len(rf_pred))

0.88532110091743121

In [21]:
feature_list

['favoriteCount',
 'retweetCount',
 'in_quotes',
 'uses_own_handle',
 'contains_http',
 'contains_hashtag',
 'contains_emojis',
 'length',
 'num_of_misspellings',
 'time_of_day_sec',
 'contains_any_at']

In [23]:
# Use the forest's predict method on the test data
test_features = create_features(test_df)
test_features = test_features.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
test_features = test_features.drop(['created', 'text', 'replyToSN'], axis = 1)
test_features_list = list(test_features.columns)
test_features = np.array(test_features)
real_predictions = rf.predict(test_features)

In [24]:
pred_df = pd.DataFrame(np.arange(300))

In [25]:
pred_df['Label'] = real_predictions

In [28]:
pred_df.rename(index=str, columns={"0": "ID", "Label": "Label"})

Unnamed: 0,0,Label
0,0,1
1,1,-1
2,2,-1
3,3,1
4,4,1
5,5,1
6,6,-1
7,7,-1
8,8,1
9,9,-1


In [201]:
pred_df.to_csv('predictions.csv', index=False)