In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown
import enchant
import datetime
d = enchant.Dict("en_US")
word_set = set(brown.words())

In [184]:
# SET UP DATA

# convert csv to a pandas DataFrame format
train_df = pd.DataFrame.from_csv('data/train.csv')
test_df = pd.DataFrame.from_csv('data/test.csv')

In [7]:
# BASIC DATASET INFORMATION

# -- our train set is 1089 x 17
print(train_df.shape)
# -- our test set is 300 x 15 (the 2 missing columns are the label and a redundant field that resembles label)
print(test_df.shape)

(1089, 17)
(300, 15)


In [8]:
train_df.shape[0]

1089

In [9]:
# FIGURE OUT LABEL DISTRIBUTION [~57% of dataset came from Android]

# create dataset with only positive labels -- 619 positive labels in training set
df_train_pos = train_df[train_df.label == 1]
# create dataset with only negative labels -- 470 negative labels in training set
df_train_neg = train_df[train_df.label == -1]
print("Number of positive [Android] training points: " + str(len(df_train_pos)))
print("Number of negative [iPhone] training points: " + str(len(df_train_neg)))
print("Percentage of training points from Android: " + str(len(df_train_pos)/train_df.shape[0]))

Number of positive [Android] training points: 619
Number of negative [iPhone] training points: 470
Percentage of training points from Android: 0


In [130]:
predictions2 = rf2.predict(validate_features)
predictions3 = rf3.predict(validate_features)
predictions4 = rf4.predict(validate_features)
predictions5 = rf5.predict(validate_features)

In [132]:
pred_sum = predictions+predictions2+predictions3+predictions4+predictions5

In [135]:
clipped_sum = np.clip(pred_sum, -1, 1)

In [138]:
sum(clipped_sum == validate_labels)

194

In [74]:
# Use the forest's predict method on the test data
predictions = rf.predict(validate_features)
# Calculate the absolute errors
errors = abs(predictions - validate_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

('Mean Absolute Error:', 0.22, 'degrees.')


In [106]:
feature_list

['favoriteCount',
 'retweetCount',
 'in_quotes',
 'uses_own_handle',
 'contains_http',
 'contains_hashtag',
 'num_of_misspellings',
 'contains_emojis',
 'length',
 'time_of_day_sec']

In [107]:
test_df.columns

Index([u'favoriteCount', u'retweetCount', u'in_quotes', u'uses_own_handle',
       u'contains_http', u'contains_hashtag', u'contains_emojis', u'length',
       u'num_of_misspellings'],
      dtype='object')

In [113]:
# Use the forest's predict method on the test data
real_predictions = rf.predict(test_df)

In [198]:
ensemble_df = pd.DataFrame(np.arange(300))

In [199]:
ensemble_df['Label'] = ensemble_pred

In [200]:
ensemble_df.rename(index=str, columns={"0": "ID", "Label": "Label"})

Unnamed: 0,0,Label
0,0,1
1,1,1
2,2,-1
3,3,1
4,4,1
5,5,1
6,6,-1
7,7,-1
8,8,1
9,9,-1


In [201]:
ensemble_df.to_csv('predictions_2ada1rf.csv', index=False)

In [121]:
real_pred = pd.DataFrame(np.arange(300))

In [123]:
real_pred['Label'] = real_predictions

In [128]:
real_pred.to_csv('predictions_rf.csv', index=False)

In [116]:
 real_pred.rename(index=str, columns={"0": "a", "B": "c"})

Unnamed: 0,0
0,1
1,-1
2,-1
3,1
4,1
5,1
6,-1
7,-1
8,1
9,-1


In [185]:
# CREATING FEATURES

# DROPPING IRRELEVANT COLUMNS
test_df = test_df.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)


#if tweet starts with quote
quotes = [t[0] == '"' for t in test_df['text']]
test_df['in_quotes'] = quotes

#it tweet contains @realdonaldtrump
test_df['uses_own_handle'] = ["@realDonaldTrump" in t for t in test_df['text']]

#if tweet contains http
test_df['contains_http'] = ["http" in t for t in test_df['text']]

#if tweet contains hashtag
test_df['contains_hashtag'] = ["#" in t for t in test_df['text']]

#check misspellings
word_set = set(brown.words())
def spell_check_sentence(sentence):
    misspelled = [not d.check(x) if x!= "" else False for x in sentence.split(" ")]
    return sum(misspelled)

#check for emojis (U+)
test_df['contains_emojis'] = ["U+" in t for t in test_df['text']]

#check length
test_df['length'] = [len(t) for t in test_df['text']]

#check num of mispellings
test_df['num_of_misspellings'] = [spell_check_sentence(s) for s in test_df['text']]



array([ 1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,
       -1, -1,  1,  1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
        1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1,
        1,  1, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1,
       -1,  1,  1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1,
        1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,
       -1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,  1,
        1,  1, -1,  1,  1

In [186]:
times = [t.split(' ')[1].split(':') for t in test_df['created']]
test_df['time_of_day_sec'] = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]

In [189]:
test_df['contains_any_at'] = ['@' in s for s in test_df['text']]
test_df = test_df.drop(['text', 'replyToSN', 'created'], axis = 1)

In [190]:
test_df

Unnamed: 0_level_0,favoriteCount,retweetCount,in_quotes,uses_own_handle,contains_http,contains_hashtag,contains_emojis,length,num_of_misspellings,time_of_day_sec,contains_any_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,9214,3107,False,False,False,False,False,67,3,55200,False
1,6981,2390,False,False,True,False,False,114,4,48480,False
2,15724,6691,False,False,True,True,False,64,4,300,False
3,19837,6402,False,False,False,False,False,134,4,83340,False
4,34051,11717,False,False,False,False,False,135,8,77460,False
5,29831,9892,False,False,False,False,False,138,4,49740,False
6,19223,5784,False,False,True,True,False,77,5,8340,False
7,19543,7930,False,False,True,False,False,93,5,7380,True
8,75488,24663,False,False,False,False,False,137,2,6780,False
9,23661,7903,False,False,True,True,False,92,4,72240,False


In [180]:
real_predictions = rf.predict(test)

NameError: name 'test' is not defined

In [82]:
sum(validate_labels == predictions)

194

In [83]:
len(validate_labels)

218

In [84]:
194/218.

0.8899082568807339

In [85]:
validate_labels == predictions

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [79]:
validate_labels

array([ 1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1, -1,  1,  1,  1,  1,  1, -1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1,
       -1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1])

In [77]:
predictions

array([ 1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,
        1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,
        1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
        1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1])

In [87]:
errors

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0])

In [76]:
mape = 100 * (errors / validate_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

('Accuracy:', 111.01, '%.')


In [94]:
test_df = create_features(test_df)

ValueError: labels ['statusSource'] not contained in axis

In [None]:

# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)