In [None]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import textblob
from textblob import TextBlob
import re
import numpy as np
import time
import seaborn as sns
from wordcloud import WordCloud
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import stop_words


# Fetch the 'vader_lexicon' NLTK resource.
# Presumably required by the SentimentIntensityAnalyzer created later -- confirm.
nltk.downloader.download('vader_lexicon')

In [None]:
# Base quote-page URL; a ticker symbol is appended to it for each request.
finviz_url = 'https://finviz.com/quote.ashx?t='

# Ticker symbols whose news headlines are scraped.
# NOTE(review): 'FB' and 'TWTR' may no longer resolve on Finviz -- confirm.
Companies = [
    'AMZN',
    'GOOG',
    'FB',
    'TWTR',
]

# Maps ticker -> scraped news table; filled by the scraping loop.
news_tables = dict()

In [None]:
# Download each ticker's Finviz quote page and keep its 'news-table' element.
for company in Companies:
    url = finviz_url + company
    # A user-agent header is sent because access is denied without one.
    req = Request(url=url, headers={'user-agent': 'my-app'})
    # The context manager closes the HTTP response even if parsing raises.
    with urlopen(req) as response:
        soup = BeautifulSoup(response, features='html.parser')
    news_table = soup.find(id='news-table')
    if news_table is None:
        # soup.find returns None when the element is absent; report it here
        # rather than letting a later cell fail on it.
        print(f"warning: no 'news-table' element found for {company}")
    news_tables[company] = news_table

# Show a row count per ticker instead of dumping the raw HTML of every table.
print({
    ticker: (len(table.find_all('tr')) if table is not None else 0)
    for ticker, table in news_tables.items()
})


In [None]:
# Flatten every scraped news table into [company, date, time, headline] rows.
parsed_data = []

for company, news_table in news_tables.items():
    if news_table is None:
        # No table was found for this ticker; nothing to parse.
        continue

    # Reset per ticker so a date never carries over from the previous company
    # and is always bound, even if the first row only holds a time.
    date = None
    for row in news_table.find_all('tr'):
        # Rows without a headline link or a timestamp cell cannot be parsed;
        # skip them instead of raising AttributeError on None.
        if row.a is None or row.td is None:
            continue
        comment = row.a.text
        # split() without arguments ignores padding, newlines and
        # non-breaking spaces around the timestamp text.
        date_data = row.td.text.split()
        if not date_data:
            continue
        if len(date_data) == 1:
            # Only a time is present: the row reuses the most recent date.
            headline_time = date_data[0]
        else:
            date = date_data[0]
            headline_time = date_data[1]
        # A separate name is used so the imported `time` module is not shadowed.
        parsed_data.append([company, date, headline_time, comment])

# Show the size and a small sample instead of dumping every row.
print(f"{len(parsed_data)} headlines parsed; first rows:")
print(parsed_data[:5])

In [None]:

# Tabulate the parsed headlines: one row per headline.
df = pd.DataFrame(
    data=parsed_data,
    columns=['Company', 'Date', 'Time', 'Comments'],
)

# VADER analyser used to score each headline.
vader = SentimentIntensityAnalyzer()

print(df)

In [None]:
# Summary statistics of the headline table.
df.describe()

In [None]:
# Preview the first rows of the headline table.
df.head()

In [None]:
# Score every headline with VADER; one dict of scores per headline, in row order.
scores = []
# Declared by the original notebook; not filled by this cell.
compound_list = []
positive_list = []
negative_list = []
neutral_list = []
# Iterate over the values directly (no label-based lookups) and call the
# analyser once per headline instead of once per score.
for comment in df['Comments']:
    polarity = vader.polarity_scores(comment)
    scores.append({"Compound": polarity["compound"],
                   "Positive": polarity["pos"],
                   "Negative": polarity["neg"],
                   "Neutral": polarity["neu"]
                   })

In [None]:
# Dump the per-headline score dictionaries.
# NOTE(review): this prints one dict per headline; consider scores[:5].
print(scores)

In [None]:
# Attach the VADER scores to the headline frame (aligned on the row index).
sentiments_score = pd.DataFrame(scores)
# Drop score columns left over from an earlier run of this cell; joining onto
# a frame that already has them raises "columns overlap but no suffix specified".
df = df.drop(columns=sentiments_score.columns, errors='ignore').join(sentiments_score)
df.head()

In [None]:
# Display the headline frame together with its sentiment scores.
df

In [None]:
# Mean "Neutral" score per company.
# The string "mean" replaces np.mean, which recent pandas versions warn about
# when passed as aggfunc; the aggregated values are the same.
score_table = df.pivot_table(index='Company', values="Neutral", aggfunc="mean")
score_table

In [None]:
# Bar chart of the per-company table built in the previous cell.
score_table.plot.bar()

In [None]:
# Mean "Compound" score per company.
# The string "mean" replaces np.mean, which recent pandas versions warn about
# when passed as aggfunc; the aggregated values are the same.
score_table = df.pivot_table(index='Company', values="Compound", aggfunc="mean")
score_table

In [None]:
# Bar chart of the per-company table built in the previous cell.
score_table.plot.bar()

In [None]:
# Mean "Negative" score per company.
# The string "mean" replaces np.mean, which recent pandas versions warn about
# when passed as aggfunc; the aggregated values are the same.
neg_score_table = df.pivot_table(index='Company', values="Negative", aggfunc="mean")
neg_score_table

In [None]:
# Bar chart of the mean negative score per company.
neg_score_table.plot.bar()

In [None]:
# Daily mean compound score per company, with dates along the x-axis.
# Only the Compound column is averaged: calling .mean() on the whole grouped
# frame would also try to average the text columns (Time, Comments), which
# raises a TypeError on pandas >= 2.0.
mean_df = df.groupby(['Company', 'Date'])['Compound'].mean()

# Move Company into the columns: rows are dates, one column per company.
mean_df = mean_df.unstack(level='Company')

# figsize is passed to plot() because DataFrame.plot opens its own figure;
# a separate plt.figure() call would only leave an empty figure behind.
mean_df.plot(kind='bar', figsize=(6, 8))
plt.show()

In [None]:
# Frequency of every whitespace-separated token across all headlines.
(
    df['Comments']
    .str.split(expand=True)
    .stack()
    .value_counts()
)

In [None]:
# Normalise the headlines to lower case, then preview the last rows.
df['Comments'] = df.Comments.str.lower()
df.tail()

In [None]:
# Hand-maintained list of English stop words. It includes apostrophe-free
# contractions (e.g. 'shes', 'youre') and contraction fragments ('ll', 've', 're').
# NOTE(review): the apostrophe-free forms can only match text whose punctuation
# has already been stripped -- confirm the order of the cleaning cells.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [None]:
# Set form of the stop-word list for constant-time membership tests.
STOPWORDS = set(stopwordlist)


def cleaning_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS.

    The input is converted with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)


# Remove stop words from every headline, then preview the first rows.
df['Comments'] = df['Comments'].apply(cleaning_stopwords)
df['Comments'].head()

In [None]:
import string

# Characters to strip: the ASCII punctuation set from the standard library.
english_punctuations = string.punctuation
punctuations_list = english_punctuations


def cleaning_punctuations(Comments):
    """Return ``Comments`` with every character of ``punctuations_list`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletions = dict.fromkeys(punctuations_list)
    return Comments.translate(str.maketrans(deletions))
# Strip punctuation from every headline, then preview the last rows.
df['Comments'] = df['Comments'].apply(cleaning_punctuations)
df['Comments'].tail()

In [None]:
def cleaning_repeating_char(Comments):
    """Collapse every run of a repeated character to a single occurrence.

    ``(.)`` captures one character and the backreference ``\\1+`` matches one
    or more immediate repetitions of it; the whole run is replaced by the
    captured character. Without the backslashes the pattern matched "any
    character followed by literal 1s" and rewrote text such as "q1" to "1".

    Note: legitimate double letters are collapsed too ("good" -> "god").

    Args:
        Comments: the text to clean.

    Returns:
        The text with consecutive duplicate characters removed.
    """
    return re.sub(r'(.)\1+', r'\1', Comments)
# Run the repeated-character cleaner over every headline, then preview the last rows.
df['Comments'] = df['Comments'].apply(cleaning_repeating_char)
df['Comments'].tail()

In [None]:
def cleaning_numbers(data):
    """Return ``data`` with every run of ASCII digits (0-9) removed."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)
# Remove digits from every headline, then preview the last rows.
df['Comments'] = df['Comments'].apply(cleaning_numbers)
df['Comments'].tail()

In [None]:
# Display the frame after the text-cleaning steps.
df

In [None]:
# Independent variables X and target Y for the regression.
# Columns are selected by name rather than by position: with positional
# slicing the target was whatever column came last ("Neutral") and the other
# sentiment scores (Compound/Positive/Negative) ended up among the features,
# leaking the target. The regression plots label the target "Compound".
X = df[['Company', 'Date', 'Time', 'Comments']].values
Y = df['Compound'].values
# Preview only the first rows instead of printing the whole array.
print(X[:5])


In [None]:
# One-hot encode the independent variables.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np 

# The columns at positions 0-3 of X are one-hot encoded; remainder='passthrough'
# leaves any other columns unencoded.
# NOTE(review): the encoder is fitted on all rows, before the train/test split.
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0,1,2,3])],remainder='passthrough')
# NOTE(review): .toarray() assumes fit_transform returned a sparse matrix -- confirm.
X = ct.fit_transform(X).toarray()

print(X)

In [None]:
# Hold out part of the rows as a test set so the model can be evaluated on
# data it was not fitted on.
#   test_size    -- proportion of the data set placed in the test split (0.0-1.0)
#   random_state -- controls the shuffling applied before the split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training the model: fit a linear regression on the training split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,max_error,explained_variance_score

regressor = LinearRegression()
regressor.fit(x_train,y_train)

In [None]:
# Predict the held-out rows.
pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
# Two columns side by side: predicted value, actual value.
joinedArrays = np.column_stack((pred, y_test))

In [None]:
# Visualising training results: actual vs. predicted target on the training split.
# x_train is a 2-D matrix with one column per encoded feature, so plotting it
# against the predictions draws one line per column. Comparing predictions
# with the true values gives a single readable picture instead.
train_pred = regressor.predict(x_train)
plt.scatter(y_train, train_pred, color='red')
# Diagonal reference: points on this line are predicted exactly.
train_limits = [min(y_train.min(), train_pred.min()), max(y_train.max(), train_pred.max())]
plt.plot(train_limits, train_limits, color='blue')
plt.title('Training Results')
plt.xlabel('Actual target')
plt.ylabel('Predicted target')
plt.show()

In [None]:
# Visualising test results: actual vs. predicted target on the held-out split.
# The predictions are computed for x_test (the original overlaid the fit of
# the training split on the test scatter).
test_pred = regressor.predict(x_test)
plt.scatter(y_test, test_pred, color='red')
# Diagonal reference: points on this line are predicted exactly.
test_limits = [min(y_test.min(), test_pred.min()), max(y_test.max(), test_pred.max())]
plt.plot(test_limits, test_limits, color='blue')
plt.title('Test Results')
plt.xlabel('Actual target')
plt.ylabel('Predicted target')
plt.show()

In [None]:
from sklearn import linear_model

# Refit a linear regression on the complete data set (no hold-out).
ols = linear_model.LinearRegression()
model = ols.fit(X, Y)
# A notebook cell only displays its final expression, so the coefficients and
# the intercept are printed explicitly instead of being silently discarded.
print("coefficients:", model.coef_)
print("intercept:", model.intercept_)
# R-squared of the model on the same X, Y it was fitted on (in-sample score).
model.score(X, Y)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

In [None]:
# Bag-of-words counts of the cleaned headlines.
coments_vectorizer = CountVectorizer(
    max_df=0.90,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
comments = coments_vectorizer.fit_transform(df['Comments'])
comments.shape

In [None]:
df = df.fillna(0)  # replace all null values by 0
from sklearn.model_selection import train_test_split

# The compound score is continuous, but the classifier, confusion matrix and
# accuracy computed from this split all need discrete class labels. Bucket the
# score with the conventional VADER cut-offs:
#   compound >= 0.05 -> positive, compound <= -0.05 -> negative, else neutral.
sentiment_labels = pd.Series(
    np.select(
        [df['Compound'] >= 0.05, df['Compound'] <= -0.05],
        ['positive', 'negative'],
        default='neutral',
    ),
    index=df.index,
    name='Sentiment',
)

F_train, F_test, m_train, m_test = train_test_split(
    comments, sentiment_labels, test_size=0.2, random_state=69
)

In [None]:
# Report the dimensions of each part of the split.
for shape_label, split_part in (
    ("F_train_shape : ", F_train),
    ("F_test_shape : ", F_test),
    ("m_train_shape : ", m_train),
    ("m_test_shape : ", m_test),
):
    print(shape_label, split_part.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier

# Fit the classifier on the training split, then label the held-out rows.
# NOTE(review): MultinomialNB is a classifier -- m_train presumably must hold
# discrete class labels; confirm what the split cell provides.
model_naive = MultinomialNB()
model_naive.fit(F_train, m_train)
predicted_naive = model_naive.predict(F_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predicted labels.
mat = confusion_matrix(m_test, predicted_naive)

plt.figure(dpi=600)
# Transposed so that predictions run along the y-axis and true labels along the x-axis.
heatmap_ax = sns.heatmap(mat.T, annot=True, fmt='d', cbar=False)
heatmap_ax.set_title('Confusion Matrix for Naive Bayes')
heatmap_ax.set_xlabel('true label')
heatmap_ax.set_ylabel('predicted label')

plt.savefig("confusion_matrix.png")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
# Arguments are given in the conventional (y_true, y_pred) order; accuracy is
# symmetric in its two arguments, so the value is the same either way.
score_naive = accuracy_score(m_test, predicted_naive)
print("Accuracy with Naive-bayes: ",score_naive)