# Importing the Data

Surprisingly, importing and cleaning the data takes a fair amount of work, so we give it a section of its own.

In [1]:
#@title A bunch of packages to download/import


!pip install -U -q PyDrive
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import defaultdict

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.utils import plot_model

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
'''
  This is just code to read csv files into google collab.
  It will ask you for access to your google drive. 
  This is just to download the data easier. 
  We aren't gonna access any of your files!

  Reference: https://stackoverflow.com/questions/48376580/google-colab-how-to-read-data-from-my-google-drive
'''
# Step 1: run Colab's interactive sign-in for the current user.
auth.authenticate_user()
# Step 2: hand the application-default Google credentials to PyDrive.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Step 3: `drive` is the authenticated client the data-loading cell uses.
drive = GoogleDrive(gauth)

In [3]:
# Fetch the shared CSV from Google Drive into the working directory,
# then load it into a DataFrame and preview the first rows.
STOCK_DATA_FILE_ID = "1yElGQFdx45VmNFMDOowx8Xk-0bCYfuwS"
STOCK_DATA_CSV = 'stock_data.csv'

stockData = drive.CreateFile({'id': STOCK_DATA_FILE_ID})
stockData.GetContentFile(STOCK_DATA_CSV)

data = pd.read_csv(STOCK_DATA_CSV)
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


Here, we realize that the text is not exactly 'clean'. Hence, we need to use a variety of methods in order to make the sentences actually readable to the models.

Reference to parts of code: https://www.analyticsvidhya.com/blog/2022/03/building-naive-bayes-classifier-from-scratch-to-perform-sentiment-analysis/#:~:text=What%20is%20a%20Naive%20Bayes,or%20sentimental%20tone%20or%20opinion)

In [4]:
# Tokenizer used by lemmatize_text to break a cleaned sentence into words.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# WordNet lemmatizer used by lemmatize_text to reduce each word to its base form.
lemmatizer = nltk.stem.WordNetLemmatizer()

def remove_tags(string):
    """Normalise one raw post into lowercase text made of word characters and spaces.

    Steps, in order:
      1. strip HTML tags such as ``<b>`` or ``</a>``;
      2. strip URLs (``http://`` and ``https://``) up to the next whitespace;
      3. replace every non-word character with a space;
      4. drop runs of digits that follow a space;
      5. lowercase the result.

    Args:
        string: The raw text of a single post.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens
        may leave consecutive spaces behind.
    """
    # A tag must start with a letter right after '<' (optionally '</'), so
    # comparisons like "price < 100 and > 90" are not mistaken for markup.
    result = re.sub(r'</?[A-Za-z][^<>]*>', '', string)   # remove HTML tags
    # Stop at the first whitespace so the text following a link is kept.
    result = re.sub(r'https?://\S+', '', result)         # remove URLs
    result = re.sub(r'[^\w]', ' ', result)               # remove non-alphanumeric characters
    result = re.sub(r' \d+', ' ', result)                # remove numbers
    return result.lower()                                # make all letters lowercase

def lemmatize_text(text):
    """Lemmatize every whitespace-separated word in ``text``.

    Each word is passed through the module-level ``lemmatizer`` and the
    results are concatenated, each followed by a single space (so a
    non-empty result always ends with a trailing space).
    """
    return "".join(
        lemmatizer.lemmatize(token) + " "
        for token in w_tokenizer.tokenize(text)
    )

# Clean each post, then lemmatize it, writing the result back into the
# 'Text' column of `data`.
data['Text'] = data['Text'].apply(remove_tags).apply(lemmatize_text)
# `cleaned_data` is another name for the same (now cleaned) frame, not a copy.
cleaned_data = data
cleaned_data.head()

Unnamed: 0,Text,Sentiment
0,kicker on my watchlist xide tit soq pnk cpw bp...,1
1,user aap movie return for the fea geed indicat...,1
2,user i d be afraid to short amzn they are look...,1
3,mnta over,1
4,oi over,1


That looks a lot better (well, not to us, because it is barely readable, but the computers will like it)! There's just one final step left: we transform each sentence into a vector of numbers so that it becomes a valid input for the models.

In [5]:
X = cleaned_data['Text']
y = cleaned_data['Sentiment']

# Split the cleaned text BEFORE fitting any transformer. Fitting the
# vocabulary and the PCA on the full dataset would let information from the
# test rows leak into the features the models are trained on. With the same
# test_size and random_state the rows land in the same train/test partition
# as a split performed after the transforms.
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Every sentence becomes a bag-of-words vector: each index represents a word
# and holds the number of times that word occurs in the sentence.
# max_features caps the vocabulary at the most frequent words, so very rare
# words are ignored. The vocabulary is learned from the training text only.
vec = CountVectorizer(max_features=4000)
X_train = vec.fit_transform(text_train).toarray()
X_test = vec.transform(text_test).toarray()

# Compress the (up to) 4000 word features into 256 principal components.
# Fewer input dimensions can help reduce overfitting and speeds up training;
# both numbers can be changed. random_state pins the randomized SVD solver so
# the features are identical on every run. The PCA is fitted on the training
# rows only and then applied unchanged to the test rows.
pca = PCA(n_components=256, random_state=0)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Keep `X` available as the full feature matrix (rows in the original order),
# produced by the train-fitted vectorizer and PCA.
X = pca.transform(vec.transform(X).toarray())