In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import enchant
import datetime
from nltk.corpus import brown
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
# Dictionary object whose check() method spell_check_sentence uses to test
# individual tokens; "en_US" is the language tag passed to enchant.
d = enchant.Dict("en_US")
# Set of the distinct tokens returned by NLTK's Brown corpus reader.
# NOTE(review): word_set is not referenced in the visible cells -- confirm
# whether it is still needed.
word_set = set(brown.words())


In [2]:
# SET UP DATA

# Load the train/test CSVs into pandas DataFrames.
# DataFrame.from_csv was deprecated and later removed from pandas (1.0);
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column used as the index, date parsing attempted on the index).
train_df = pd.read_csv('data/train.csv', index_col=0, parse_dates=True)
test_df = pd.read_csv('data/test.csv', index_col=0, parse_dates=True)

In [3]:
#check misspellings
def spell_check_sentence(sentence):
    """Return how many tokens of `sentence` the dictionary `d` rejects.

    The sentence is split on single space characters. Empty tokens (produced
    by consecutive, leading or trailing spaces) are never passed to
    `d.check` and never counted.
    """
    rejected = 0
    for token in sentence.split(" "):
        if token == "":
            continue
        if not d.check(token):
            rejected += 1
    return rejected

In [4]:
def create_features(df):
    """Add hand-engineered tweet features to `df` and return it.

    The new columns are assigned onto `df` itself (the frame is modified in
    place) and the same object is returned.

    Reads:
        df['text']    -- tweet text, one string per row.
        df['created'] -- timestamp strings; the second space-separated token
                         is split on ':' and its first two fields are read
                         as hours and minutes.

    Adds the columns: in_quotes, uses_own_handle, contains_http,
    contains_hashtag, contains_emojis, length, num_of_misspellings,
    time_of_day_sec, contains_any_at.
    """
    # CREATING FEATURES
    texts = df['text']

    # if tweet starts with a quote; startswith() is simply False for an empty
    # tweet, whereas indexing t[0] would raise IndexError
    df['in_quotes'] = [t.startswith('"') for t in texts]

    # if tweet contains @realDonaldTrump (case-sensitive substring match)
    df['uses_own_handle'] = ["@realDonaldTrump" in t for t in texts]

    # if tweet contains http
    df['contains_http'] = ["http" in t for t in texts]

    # if tweet contains hashtag
    df['contains_hashtag'] = ["#" in t for t in texts]

    # check for emojis (looks for the literal characters "U+")
    df['contains_emojis'] = ["U+" in t for t in texts]

    # check length (number of characters)
    df['length'] = [len(t) for t in texts]

    # check num of misspellings
    df['num_of_misspellings'] = [spell_check_sentence(s) for s in texts]

    # converting created to time of day (in seconds); only the hour and
    # minute fields are used, any seconds field is ignored
    times = [t.split(' ')[1].split(':') for t in df['created']]
    df['time_of_day_sec'] = [
        datetime.timedelta(hours=int(time[0]), minutes=int(time[1])).seconds
        for time in times
    ]

    # if tweet contains any @ character
    df['contains_any_at'] = ['@' in s for s in texts]

    return df

In [5]:
# Engineer the features on the training frame, pull the target out, then keep
# only the engineered columns as the model's input matrix.
features = create_features(train_df)
labels = np.array(features['label'])
features = features.drop(
    columns=[
        # raw tweet metadata that is not used as model input
        'favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID',
        'statusSource', 'screenName', 'isRetweet', 'retweeted',
        'longitude', 'latitude',
        # the target plus the raw fields the engineered features came from
        'created', 'label', 'text', 'replyToSN',
    ]
)
feature_list = list(features.columns)
features = np.array(features)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_features, validate_features, train_labels, validate_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Gradient-boosted tree classifier; max_depth=5 is the only hyperparameter
# passed explicitly.
xgboost = XGBClassifier(max_depth=5)
# Fit on the training part of the split. As the cell's last expression, the
# value returned by fit() is what the notebook displays.
xgboost.fit(train_features, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
# Validation accuracy: the share of held-out rows whose predicted label equals the true label.
(xgboost.predict(validate_features) == validate_labels).mean()

0.87155963302752293

In [18]:
# Build the test-set feature matrix the same way as the training one, then predict.
test_features = create_features(test_df)
test_features = test_features.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
# The training cell also drops 'statusSource'; drop it here too when the test
# CSV has it (errors='ignore' makes this a no-op when the column is absent).
test_features = test_features.drop(['statusSource'], axis=1, errors='ignore')
test_features = test_features.drop(['created', 'text', 'replyToSN'], axis = 1)
test_features_list = list(test_features.columns)
# predict() receives a bare array, so the columns must line up with training.
assert test_features_list == feature_list, "test feature columns differ from the training feature columns"
test_features = np.array(test_features)

# `ada` is never defined in this notebook; the classifier fitted above is
# `xgboost`. The result keeps the name `ada_pred` because later cells read it.
ada_pred = xgboost.predict(test_features)

In [19]:
# One sequential id (0..n-1) per prediction. Sized from the predictions rather
# than a fixed row count so the Label assignment that follows cannot fail on a
# length mismatch.
pred_df = pd.DataFrame(np.arange(len(ada_pred)))

In [20]:
# Attach the predicted labels as a second column next to the id values
# (the id column is still named 0 at this point).
pred_df['Label'] = ada_pred

In [21]:
# Name the two columns: the id column becomes 'ID', 'Label' keeps its name.
pred_df.columns = ['ID', 'Label']

In [23]:
# Write the ID/Label table to CSV; index=False leaves the DataFrame's row index out of the file.
pred_df.to_csv('ada_predictions.csv', index=False)