In [38]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#list of words to filter out of statements
stop_words = set(stopwords.words('english'))

Helper functions to clean and generate sentiment analysis

In [39]:
import ast
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import LancasterStemmer

# stemmer to turn words into their roots
lan=LancasterStemmer()

def preprocess_tweet_text(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+|ftp\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub('(@Tesla)','Tesla',tweet)
    tweet = re.sub('(&amp)','',tweet)
    tweet = re.sub(r'\@\w+|\#','', tweet)
    tweet = re.sub('[^\w ]','',tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet_tokens = word_tokenize(tweet,preserve_line=True)
    filtered_words = [lan.stem(w) for w in tweet_tokens if not w in stop_words]

    return " ".join(filtered_words)

def combine_tweets(tweet):
    tweet = ast.literal_eval(tweet)
    return " ".join(tweet)


def avg(data):
    data = ast.literal_eval(data)
    sum = 0
    for item in data:
        sum += item
    return sum/len(data)

def getSIA(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)

def vader(row):
    tweets = ast.literal_eval(row['Tweets'])
    pos= []
    neg= []
    neu= []
    for tweet in tweets:
        SIA = getSIA(preprocess_tweet_text(tweet))
        pos.append(SIA['pos'])
        neg.append(SIA['neu'])
        neu.append(SIA['neg'])
    return np.average(pos), np.average(neg), np.average(neu)

Read csv and turn data call helper functions

In [40]:
#original
from sklearn.model_selection import train_test_split

df=pd.read_csv("results.csv")
df['Tweets'] = df['Tweets'].apply(combine_tweets)
df['Tweets'] = df['Tweets'].apply(preprocess_tweet_text)
df['label'] = (df['Dif'] > 0) * 1

Split data and count vecotrize the features

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,preprocessor = None,stop_words = None, max_features = 5000)
x_train, x_test, y_train, y_test = train_test_split(df["Tweets"], df["label"], test_size = 0.3, random_state=42)

x_train = vectorizer.fit_transform(x_train)
x_train = x_train.toarray()
print(x_train.shape)
x_test = vectorizer.transform(x_test)
x_test = x_test.toarray()
print(x_test.shape)

(779, 5000)
(335, 5000)


Train bag of words model and evaluate its performance

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, f1_score

model = RandomForestClassifier(n_estimators=190, random_state=36).fit(x_train,y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test,y_pred)

print("Accuracy is:", accuracy_score(y_test, y_pred))

print("F-score is:", f1_score(y_test, y_pred))

classificationReport = classification_report(y_test, y_pred, target_names=['Price Drop', 'Price Rise'])
print("\nClassification Report:\n", classificationReport)

Accuracy is: 0.4537313432835821
F-score is: 0.5869074492099322

Classification Report:
               precision    recall  f1-score   support

  Price Drop       0.37      0.13      0.19       167
  Price Rise       0.47      0.77      0.59       168

    accuracy                           0.45       335
   macro avg       0.42      0.45      0.39       335
weighted avg       0.42      0.45      0.39       335



## Modification made
using Vader instead of bag of words

Read CSV and clean data

In [43]:
df=pd.read_csv("results.csv")

df['pos'] = df.apply(vader,axis=1)
df[['pos', 'neg', 'neu']] = pd.DataFrame(df['pos'].tolist(), index=df.index)
df['Likes'] = df['Likes'].apply(avg)
df['label'] = (df['Dif'] > 0) * 1

Filter columns for features and label

In [44]:
keep = ['pos', 'neu','neg','label', 'Likes']
filtered = df[keep]
print(filtered.columns)

Index(['pos', 'neu', 'neg', 'label', 'Likes'], dtype='object')


Split data

In [45]:
from sklearn.model_selection import train_test_split

X = filtered.loc[ : , filtered.columns != 'label']
y = filtered['label']

x_train, x_test, y_train, y_test =  train_test_split(X , y, test_size = 0.2, random_state = 0)

Train model, and generate the scores

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, f1_score

model = RandomForestClassifier(n_estimators=190, random_state=36).fit(x_train,y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test,y_pred)

print("Accuracy is:", accuracy_score(y_test, y_pred))

print("F-score is:", f1_score(y_test, y_pred))

classificationReport = classification_report(y_test, y_pred, target_names=['Price Drop', 'Price Rise'])
print("\nClassification Report:\n", classificationReport)

Accuracy is: 0.6143497757847534
F-score is: 0.6814814814814815

Classification Report:
               precision    recall  f1-score   support

  Price Drop       0.62      0.44      0.51       103
  Price Rise       0.61      0.77      0.68       120

    accuracy                           0.61       223
   macro avg       0.61      0.60      0.60       223
weighted avg       0.61      0.61      0.60       223

