In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import enchant
import datetime
from nltk.corpus import brown
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
d = enchant.Dict("en_US")
word_set = set(brown.words())


In [2]:
# SET UP DATA

# convert csv to a pandas DataFrame format
train_df = pd.DataFrame.from_csv('data/train.csv')
test_df = pd.DataFrame.from_csv('data/test.csv')

In [3]:
#check misspellings
def spell_check_sentence(sentence):
    misspelled = [not d.check(x) if x!= "" else False for x in sentence.split(" ")]
    return sum(misspelled)

In [4]:
def create_features(df):
    # CREATING FEATURES

    #if tweet starts with quote
    quotes = [t[0] == '"' for t in df['text']]
    df['in_quotes'] = quotes

    #it tweet contains @realdonaldtrump
    df['uses_own_handle'] = ["@realDonaldTrump" in t for t in df['text']]

    #if tweet contains http
    df['contains_http'] = ["http" in t for t in df['text']]

    #if tweet contains hashtag
    df['contains_hashtag'] = ["#" in t for t in df['text']]

    #check for emojis (U+)
    df['contains_emojis'] = ["U+" in t for t in df['text']]

    #check length
    df['length'] = [len(t) for t in df['text']]

    #check num of mispellings
    df['num_of_misspellings'] = [spell_check_sentence(s) for s in df['text']]
    
    #converting created to time of day (in seconds)
    times = [t.split(' ')[1].split(':') for t in df['created']]
    df['time_of_day_sec'] = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]
    
    df['contains_any_at'] = ['@' in s for s in df['text']]
    
    return df

In [5]:
features = create_features(train_df)
features = features.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'statusSource', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
labels = np.array(features['label'])
features = features.drop(['created', 'label', 'text', 'replyToSN'], axis = 1)
feature_list = list(features.columns)
features = np.array(features)

train_features, validate_features, train_labels, validate_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [6]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(train_features, train_labels)
ada2 = AdaBoostClassifier(n_estimators=500)
ada2.fit(train_features, train_labels)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=500, random_state=None)

In [7]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_features, train_labels)

GaussianNB(priors=None)

In [8]:
X = train_features
y = train_labels
from sklearn.svm import SVC
clf = SVC(gamma='auto', C = 3.)
clf.fit(X, y) 

SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
#IMPLEMENTING RANDOM FOREST
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [10]:
ada_pred = ada.predict(validate_features)
ada_pred2 = ada2.predict(validate_features)
rf_pred = rf.predict(validate_features)

ensemble_pred = np.clip((ada_pred+ada_pred2+rf_pred), -1, 1)

In [11]:
sum(ensemble_pred == validate_labels)/float(len(ensemble_pred))

0.88990825688073394

In [23]:
# Use the forest's predict method on the test data
test_features = create_features(test_df)
test_features = test_features.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
test_features = test_features.drop(['created', 'text', 'replyToSN'], axis = 1)
test_features_list = list(test_features.columns)
test_features = np.array(test_features)

ada_pred = ada.predict(test_features)
ada_pred2 = ada2.predict(test_features)
rf_pred = rf.predict(test_features)

real_predictions = np.clip((ada_pred+ada_pred2+rf_pred), -1, 1)

In [24]:
pred_df = pd.DataFrame(np.arange(300))

In [25]:
pred_df['Label'] = real_predictions

In [29]:
pred_df.columns = ['ID', 'Label']

In [201]:
pred_df.to_csv('ensemble_predictions.csv', index=False)