In [20]:
# PLACE ALL IMPORTS IN THIS CELL

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown
import enchant
import datetime

In [5]:
d = enchant.Dict("en_US")

In [109]:
# SET UP DATA

# convert csv to a pandas DataFrame format
# train_df = pd.DataFrame.from_csv('data/train.csv')
test_df = pd.DataFrame.from_csv('data/test.csv')

In [7]:
# BASIC DATASET INFORMATION

# -- our train set is 1089 x 17
print(train_df.shape)
# -- our test set is 300 x 15 (the 2 missing columns are the label and a redundant field that resembles label)
print(test_df.shape)

(1089, 17)
(300, 15)


In [8]:
train_df.shape[0]

1089

In [9]:
# FIGURE OUT LABEL DISTRIBUTION [~57% of dataset came from Android]

# create dataset with only positive labels -- 619 positive labels in training set
df_train_pos = train_df[train_df.label == 1]
# create dataset with only negative labels -- 470 negative labels in training set
df_train_neg = train_df[train_df.label == -1]
print("Number of positive [Android] training points: " + str(len(df_train_pos)))
print("Number of negative [iPhone] training points: " + str(len(df_train_neg)))
print("Percentage of training points from Android: " + str(len(df_train_pos)/train_df.shape[0]))

Number of positive [Android] training points: 619
Number of negative [iPhone] training points: 470
Percentage of training points from Android: 0


In [10]:
# EXPLORE TIME FEATURE
print(train_df['created'])

id
0         7/12/16 0:56
1        7/11/16 22:18
2        7/11/16 21:40
3        7/11/16 19:51
4        7/11/16 11:57
5        7/10/16 18:58
6        7/10/16 18:42
7        7/10/16 18:27
8        7/10/16 12:02
9         7/9/16 21:22
10        7/9/16 15:18
11        7/8/16 23:26
12        7/8/16 21:31
13        7/8/16 14:32
14        7/8/16 13:20
15        7/8/16 11:02
16         7/8/16 1:52
17        7/7/16 20:09
18        7/7/16 20:06
19        7/7/16 20:04
20        7/7/16 18:07
21        7/7/16 11:33
22         7/7/16 1:34
23        7/6/16 21:58
24        7/6/16 14:06
25        7/6/16 14:01
26        7/6/16 13:24
27        7/6/16 13:12
28        7/6/16 13:08
29        7/6/16 12:31
             ...      
1059     1/16/16 19:32
1060     1/16/16 18:31
1061     1/16/16 18:26
1062     1/16/16 18:23
1063     1/16/16 18:22
1064     1/16/16 18:21
1065     1/16/16 16:11
1066     1/16/16 13:42
1067     1/16/16 13:31
1068      1/8/16 23:12
1069      1/8/16 22:36
1070      1/8/16 22:09
1071    

In [10]:
# DROPPING IRRELEVANT COLUMNS
train_df = train_df.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'statusSource', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)


In [91]:
def create_features(train_df):
    # CREATING FEATURES
    
    # DROPPING IRRELEVANT COLUMNS
    train_df = train_df.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'statusSource', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)
    print(train_df)

    #if tweet starts with quote
    quotes = [t[0] == '"' for t in train_df['text']]
    train_df['in_quotes'] = quotes

    #it tweet contains @realdonaldtrump
    train_df['uses_own_handle'] = ["@realDonaldTrump" in t for t in train_df['text']]

    #if tweet contains http
    train_df['contains_http'] = ["http" in t for t in train_df['text']]

    #if tweet contains hashtag
    train_df['contains_hashtag'] = ["#" in t for t in train_df['text']]

    #check misspellings
    word_set = set(brown.words())
    def spell_check_sentence(sentence):
        misspelled = [not d.check(x) if x!= "" else False for x in sentence.split(" ")]
        return sum(misspelled)

    #check for emojis (U+)
    train_df['contains_emojis'] = ["U+" in t for t in train_df['text']]

    #check length
    train_df['length'] = [len(t) for t in train_df['text']]

    #check num of mispellings
    train_df['num_of_misspellings'] = [spell_check_sentence(s) for s in train_df['text']]

    return train_df

In [48]:
t

Unnamed: 0_level_0,text,favoriteCount,replyToSN,created,retweetCount,label,in_quotes,uses_own_handle,contains_http,contains_hashtag,num_of_misspellings,contains_emojis,length,time_of_day_sec
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Senior United States District Judge Robert E. ...,14207,,7/12/16 0:56,5256,-1,False,False,True,False,2,False,140,3360
1,Speech on Veterans' Reform: https://t.co/XB7R...,9666,,7/11/16 22:18,3432,-1,False,False,True,False,3,False,52,80280
2,Great poll- Florida! Thank you! https://t.co/4...,25531,,7/11/16 21:40,8810,-1,False,False,True,False,4,False,55,78000
3,Thoughts and prayers with the victims; and the...,28850,,7/11/16 19:51,9112,-1,False,False,False,False,6,False,136,71460
4,Join me in Westfield; Indiana- tomorrow night ...,12567,,7/11/16 11:57,4144,-1,False,False,True,True,7,False,126,43020
5,I heard that the underachieving John King of @...,22978,,7/10/16 18:58,6564,1,False,False,False,False,4,False,139,68280
6,The media is so dishonest. If I make a stateme...,44600,,7/10/16 18:42,14520,1,False,False,False,False,4,False,140,67320
7,President Obama thinks the nation is not as di...,35167,,7/10/16 18:27,11975,1,False,False,False,False,2,False,113,66420
8,Look what is happening to our country under th...,55495,,7/10/16 12:02,19030,1,False,False,False,False,2,False,138,43320
9,New poll - thank you! #Trump2016 https://t.co...,24040,,7/9/16 21:22,9147,-1,False,False,True,True,4,False,81,76920


In [36]:
times = [t.split(' ')[1].split(':') for t in train_df['created']]
train_df['time_of_day_sec'] = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]

In [56]:
train_df

Unnamed: 0_level_0,text,favoriteCount,replyToSN,created,retweetCount,label,in_quotes,uses_own_handle,contains_http,contains_hashtag,num_of_misspellings,contains_emojis,length,time_of_day_sec
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Senior United States District Judge Robert E. ...,14207,,7/12/16 0:56,5256,-1,False,False,True,False,2,False,140,3360
1,Speech on Veterans' Reform: https://t.co/XB7R...,9666,,7/11/16 22:18,3432,-1,False,False,True,False,3,False,52,80280
2,Great poll- Florida! Thank you! https://t.co/4...,25531,,7/11/16 21:40,8810,-1,False,False,True,False,4,False,55,78000
3,Thoughts and prayers with the victims; and the...,28850,,7/11/16 19:51,9112,-1,False,False,False,False,6,False,136,71460
4,Join me in Westfield; Indiana- tomorrow night ...,12567,,7/11/16 11:57,4144,-1,False,False,True,True,7,False,126,43020
5,I heard that the underachieving John King of @...,22978,,7/10/16 18:58,6564,1,False,False,False,False,4,False,139,68280
6,The media is so dishonest. If I make a stateme...,44600,,7/10/16 18:42,14520,1,False,False,False,False,4,False,140,67320
7,President Obama thinks the nation is not as di...,35167,,7/10/16 18:27,11975,1,False,False,False,False,2,False,113,66420
8,Look what is happening to our country under th...,55495,,7/10/16 12:02,19030,1,False,False,False,False,2,False,138,43320
9,New poll - thank you! #Trump2016 https://t.co...,24040,,7/9/16 21:22,9147,-1,False,False,True,True,4,False,81,76920


In [30]:
(times)

[['0', '56'],
 ['22', '18'],
 ['21', '40'],
 ['19', '51'],
 ['11', '57'],
 ['18', '58'],
 ['18', '42'],
 ['18', '27'],
 ['12', '02'],
 ['21', '22'],
 ['15', '18'],
 ['23', '26'],
 ['21', '31'],
 ['14', '32'],
 ['13', '20'],
 ['11', '02'],
 ['1', '52'],
 ['20', '09'],
 ['20', '06'],
 ['20', '04'],
 ['18', '07'],
 ['11', '33'],
 ['1', '34'],
 ['21', '58'],
 ['14', '06'],
 ['14', '01'],
 ['13', '24'],
 ['13', '12'],
 ['13', '08'],
 ['12', '31'],
 ['12', '11'],
 ['11', '21'],
 ['11', '12'],
 ['4', '36'],
 ['4', '30'],
 ['4', '23'],
 ['2', '21'],
 ['0', '52'],
 ['12', '05'],
 ['11', '14'],
 ['22', '59'],
 ['22', '30'],
 ['15', '34'],
 ['15', '26'],
 ['14', '25'],
 ['14', '19'],
 ['14', '11'],
 ['13', '56'],
 ['13', '42'],
 ['20', '16'],
 ['17', '40'],
 ['21', '13'],
 ['20', '39'],
 ['15', '33'],
 ['15', '32'],
 ['15', '23'],
 ['15', '19'],
 ['15', '19'],
 ['13', '55'],
 ['13', '45'],
 ['11', '55'],
 ['11', '48'],
 ['1', '29'],
 ['1', '19'],
 ['19', '47'],
 ['17', '07'],
 ['16', '43'],
 ['15

In [34]:
t = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]

In [35]:
t

[3360,
 80280,
 78000,
 71460,
 43020,
 68280,
 67320,
 66420,
 43320,
 76920,
 55080,
 84360,
 77460,
 52320,
 48000,
 39720,
 6720,
 72540,
 72360,
 72240,
 65220,
 41580,
 5640,
 79080,
 50760,
 50460,
 48240,
 47520,
 47280,
 45060,
 43860,
 40860,
 40320,
 16560,
 16200,
 15780,
 8460,
 3120,
 43500,
 40440,
 82740,
 81000,
 56040,
 55560,
 51900,
 51540,
 51060,
 50160,
 49320,
 72960,
 63600,
 76380,
 74340,
 55980,
 55920,
 55380,
 55140,
 55140,
 50100,
 49500,
 42900,
 42480,
 5340,
 4740,
 71220,
 61620,
 60180,
 57060,
 48720,
 48240,
 43920,
 43920,
 43860,
 81660,
 70200,
 70200,
 70140,
 70080,
 67980,
 9540,
 9000,
 6360,
 82620,
 66060,
 53280,
 52740,
 52620,
 47640,
 47340,
 10500,
 1860,
 70200,
 70080,
 69960,
 69840,
 69840,
 69780,
 69600,
 49140,
 47880,
 47220,
 5580,
 5460,
 2280,
 84240,
 83580,
 80880,
 80100,
 79740,
 76860,
 59460,
 58860,
 57960,
 41760,
 41580,
 41040,
 12720,
 9600,
 9540,
 9420,
 9420,
 57600,
 54420,
 53700,
 43200,
 16140,
 81060,
 7

In [76]:
train_df.apply(lambda row: row.label == 1 and 'MAGA' in row.text, axis=1).sum()

3

In [62]:
[d.check(x) for x in train_df['text'][0].split(" ")]

[True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False]

In [38]:
train_df

Unnamed: 0_level_0,text,favoriteCount,replyToSN,created,retweetCount,label,in_quotes,uses_own_handle,contains_http,contains_hashtag,num_of_misspellings,contains_emojis,length,time_of_day_sec
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Senior United States District Judge Robert E. ...,14207,,7/12/16 0:56,5256,-1,False,False,True,False,2,False,140,3360
1,Speech on Veterans' Reform: https://t.co/XB7R...,9666,,7/11/16 22:18,3432,-1,False,False,True,False,3,False,52,80280
2,Great poll- Florida! Thank you! https://t.co/4...,25531,,7/11/16 21:40,8810,-1,False,False,True,False,4,False,55,78000
3,Thoughts and prayers with the victims; and the...,28850,,7/11/16 19:51,9112,-1,False,False,False,False,6,False,136,71460
4,Join me in Westfield; Indiana- tomorrow night ...,12567,,7/11/16 11:57,4144,-1,False,False,True,True,7,False,126,43020
5,I heard that the underachieving John King of @...,22978,,7/10/16 18:58,6564,1,False,False,False,False,4,False,139,68280
6,The media is so dishonest. If I make a stateme...,44600,,7/10/16 18:42,14520,1,False,False,False,False,4,False,140,67320
7,President Obama thinks the nation is not as di...,35167,,7/10/16 18:27,11975,1,False,False,False,False,2,False,113,66420
8,Look what is happening to our country under th...,55495,,7/10/16 12:02,19030,1,False,False,False,False,2,False,138,43320
9,New poll - thank you! #Trump2016 https://t.co...,24040,,7/9/16 21:22,9147,-1,False,False,True,True,4,False,81,76920


In [54]:
test_df = create_features(test_df)

In [59]:
test_df.columns

Index([u'text', u'favorited', u'favoriteCount', u'replyToSN', u'created',
       u'truncated', u'replyToSID', u'id.1', u'replyToUID', u'screenName',
       u'retweetCount', u'isRetweet', u'retweeted', u'longitude', u'latitude',
       u'in_quotes', u'uses_own_handle', u'contains_http', u'contains_hashtag',
       u'contains_emojis', u'length', u'num_of_misspellings'],
      dtype='object')

In [70]:
train_df

Unnamed: 0_level_0,text,favoriteCount,replyToSN,created,retweetCount,label,in_quotes,uses_own_handle,contains_http,contains_hashtag,num_of_misspellings,contains_emojis,length,time_of_day_sec
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Senior United States District Judge Robert E. ...,14207,,7/12/16 0:56,5256,-1,False,False,True,False,2,False,140,3360
1,Speech on Veterans' Reform: https://t.co/XB7R...,9666,,7/11/16 22:18,3432,-1,False,False,True,False,3,False,52,80280
2,Great poll- Florida! Thank you! https://t.co/4...,25531,,7/11/16 21:40,8810,-1,False,False,True,False,4,False,55,78000
3,Thoughts and prayers with the victims; and the...,28850,,7/11/16 19:51,9112,-1,False,False,False,False,6,False,136,71460
4,Join me in Westfield; Indiana- tomorrow night ...,12567,,7/11/16 11:57,4144,-1,False,False,True,True,7,False,126,43020
5,I heard that the underachieving John King of @...,22978,,7/10/16 18:58,6564,1,False,False,False,False,4,False,139,68280
6,The media is so dishonest. If I make a stateme...,44600,,7/10/16 18:42,14520,1,False,False,False,False,4,False,140,67320
7,President Obama thinks the nation is not as di...,35167,,7/10/16 18:27,11975,1,False,False,False,False,2,False,113,66420
8,Look what is happening to our country under th...,55495,,7/10/16 12:02,19030,1,False,False,False,False,2,False,138,43320
9,New poll - thank you! #Trump2016 https://t.co...,24040,,7/9/16 21:22,9147,-1,False,False,True,True,4,False,81,76920


In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [71]:
labels = np.array(train_df['label'])
features = train_df.drop(['created', 'label', 'text', 'replyToSN'], axis = 1)
feature_list = list(features.columns)
features = np.array(features)

train_features, validate_features, train_labels, validate_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [72]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [74]:
# Use the forest's predict method on the test data
predictions = rf.predict(validate_features)
# Calculate the absolute errors
errors = abs(predictions - validate_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

('Mean Absolute Error:', 0.22, 'degrees.')


In [106]:
feature_list

['favoriteCount',
 'retweetCount',
 'in_quotes',
 'uses_own_handle',
 'contains_http',
 'contains_hashtag',
 'num_of_misspellings',
 'contains_emojis',
 'length',
 'time_of_day_sec']

In [107]:
test_df.columns

Index([u'favoriteCount', u'retweetCount', u'in_quotes', u'uses_own_handle',
       u'contains_http', u'contains_hashtag', u'contains_emojis', u'length',
       u'num_of_misspellings'],
      dtype='object')

In [113]:
# Use the forest's predict method on the test data
real_predictions = rf.predict(test_df)

In [121]:
real_pred = pd.DataFrame(np.arange(300))

In [123]:
real_pred['Label'] = real_predictions

In [128]:
real_pred.to_csv('predictions_rf.csv', index=False)

In [116]:
 real_pred.rename(index=str, columns={"0": "a", "B": "c"})

Unnamed: 0,0
0,1
1,-1
2,-1
3,1
4,1
5,1
6,-1
7,-1
8,1
9,-1


In [110]:
# CREATING FEATURES

# DROPPING IRRELEVANT COLUMNS
test_df = test_df.drop(['favorited', 'truncated', 'replyToSID', 'id.1', 'replyToUID', 'screenName', 'isRetweet', 'retweeted','longitude', 'latitude'], axis=1)


#if tweet starts with quote
quotes = [t[0] == '"' for t in test_df['text']]
test_df['in_quotes'] = quotes

#it tweet contains @realdonaldtrump
test_df['uses_own_handle'] = ["@realDonaldTrump" in t for t in test_df['text']]

#if tweet contains http
test_df['contains_http'] = ["http" in t for t in test_df['text']]

#if tweet contains hashtag
test_df['contains_hashtag'] = ["#" in t for t in test_df['text']]

#check misspellings
word_set = set(brown.words())
def spell_check_sentence(sentence):
    misspelled = [not d.check(x) if x!= "" else False for x in sentence.split(" ")]
    return sum(misspelled)

#check for emojis (U+)
test_df['contains_emojis'] = ["U+" in t for t in test_df['text']]

#check length
test_df['length'] = [len(t) for t in test_df['text']]

#check num of mispellings
test_df['num_of_misspellings'] = [spell_check_sentence(s) for s in test_df['text']]



In [111]:
times = [t.split(' ')[1].split(':') for t in test_df['created']]
test_df['time_of_day_sec'] = [datetime.timedelta(hours = int(time[0]), minutes = int(time[1])).seconds for time in times]

In [112]:
test_df = test_df.drop(['text', 'replyToSN', 'created'], axis = 1)

Unnamed: 0_level_0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id.1,replyToUID,screenName,...,retweeted,longitude,latitude,in_quotes,uses_own_handle,contains_http,contains_hashtag,contains_emojis,length,num_of_misspellings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,My economic policy speech will be carried live...,False,9214,,8/8/2016 15:20,False,,7.630000e+17,,realDonaldTrump,...,False,,,False,False,False,False,False,67,3
1,Join me in Fayetteville; North Carolina tomorr...,False,6981,,8/8/2016 13:28,False,,7.630000e+17,,realDonaldTrump,...,False,,,False,False,True,False,False,114,4
2,"#ICYMI: ""Will Media Apologize to Trump?"" https...",False,15724,,8/8/2016 0:05,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,True,True,False,64,4
3,Michael Morell; the lightweight former Acting ...,False,19837,,8/7/2016 23:09,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,False,False,False,134,4
4,The media is going crazy. They totally distort...,False,34051,,8/7/2016 21:31,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,False,False,False,135,8
5,I see where Mayor Stephanie Rawlings-Blake of ...,False,29831,,8/7/2016 13:49,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,False,False,False,138,4
6,Thank you Windham; New Hampshire! #TrumpPence1...,False,19223,,8/7/2016 2:19,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,True,True,False,77,5
7,.@Larry_Kudlow - 'Donald Trump Is the middle-c...,False,19543,,8/7/2016 2:03,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,True,False,False,93,5
8,I am not just running against Crooked Hillary ...,False,75488,,8/7/2016 1:53,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,False,False,False,137,2
9,#CrookedHillary is not fit to be our next pres...,False,23661,,8/6/2016 20:04,False,,7.620000e+17,,realDonaldTrump,...,False,,,False,False,True,True,False,92,4


In [None]:
real_predictions = rf.predict(test)

In [82]:
sum(validate_labels == predictions)

194

In [83]:
len(validate_labels)

218

In [84]:
194/218.

0.8899082568807339

In [85]:
validate_labels == predictions

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [79]:
validate_labels

array([ 1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1, -1,  1,  1,  1,  1,  1, -1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1,
       -1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1])

In [77]:
predictions

array([ 1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,
        1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
       -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,
        1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,
        1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1])

In [87]:
errors

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0])

In [76]:
mape = 100 * (errors / validate_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

('Accuracy:', 111.01, '%.')


In [None]:

# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)