In [None]:
python -m spacy download en_core_web_sm

In [23]:
import os
import re
import parfit.parfit as pf

##Spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import spacy 
import en_core_web_sm
import string
from spacy.lang.en.stop_words import STOP_WORDS

import pandas as pd
import numpy as np
from numpy import asarray
from numpy import zeros
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

## preprocessing tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,ParameterGrid
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score, f1_score,make_scorer
from skopt import BayesSearchCV

## algorithm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

## tensorflow
# import tensorflow as tf
# from tensorflow.keras.datasets import imdb
# from keras.preprocessing.text import one_hot, Tokenizer
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import LSTM
# from tensorflow.keras.layers import Embedding
# from tensorflow.keras.preprocessing import sequence
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow import keras
# from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
# from keras.layers import Conv1D

#autenticating to google
# auth.authenticate_user()
# creds, _ = default()
# gc = gspread.authorize(creds)

nlp=spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')
nlp.pipe_names


df = pd.read_csv('stock_data.csv')
df = df.iloc[:,1:4]
df['Text'] = df['Text'].astype(str)
df['Date'] = pd.to_datetime(df['Date'])
df['Sentiment'] = df['Sentiment'].astype(np.int64)
print(df.info())

test_data = pd.read_csv('s1.csv')
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data= test_data.iloc[:,1:4]
test_data.rename(columns = {'Headline':'Text','Target':'Sentiment'}, inplace = True)
test_data['Text'] = test_data['Text'].astype(str)
test_data =test_data.replace(r'^\s*$', np.nan, regex=True)
test_data = test_data[['Text','Date']].merge(df, on=['Text','Date'], how='left')
test_data = test_data[test_data['Text'].str.contains("\?")==False]
test_data = test_data.loc[test_data['Sentiment'].isnull() == True][:500]
print(test_data.info())
print(df['Sentiment'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2887 entries, 0 to 2886
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       2887 non-null   object        
 1   Date       2887 non-null   datetime64[ns]
 2   Sentiment  2887 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 67.8+ KB
None




<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 2923 to 3456
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       500 non-null    object        
 1   Date       500 non-null    datetime64[ns]
 2   Sentiment  0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 15.6+ KB
None
 1    1486
-1     708
 0     693
Name: Sentiment, dtype: int64


In [24]:
punct = string.punctuation
stopwords = list(STOP_WORDS)
def text_data_cleaning(sentence):
    sent = preprocess_text(sentence)
    doc = nlp(sent)
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return append_message(cleaned_tokens)
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    sentence = sen.lower()

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) 
    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  
    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence


def append_message(text):
  str = " "
  return (str.join(text))


df['Tok_text'] = df['Text'].apply(preprocess_text)

tf_idf_vect = TfidfVectorizer()
X = df['Tok_text']
y = df['Sentiment']
# X = tf_idf_vect.fit_transform(X)

##splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train = tf_idf_vect.fit_transform(X_train)
X_test = tf_idf_vect.transform(X_test)
print(X_train.shape, X_test.shape)
## test data preprocessing
test_data['Tok_text'] = test_data['Text'].apply(preprocess_text)
test = test_data['Tok_text']
x_test = tf_idf_vect.transform(test_data['Tok_text'])
print(x_test.shape)

(2309, 3690) (578, 3690)
(500, 3690)


In [22]:
score = [0]

classifier = SGDClassifier()
param_grid = {
        'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate # number of epochs
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber'], # logistic regression,
        'penalty': ['l2'],
        'n_jobs': [-1]
    }

grid1 = GridSearchCV(classifier, param_grid, refit = True, verbose = 0)
    # fitting the model for grid search
grid1.fit(X_train, y_train)
print("For SDG classification")
score.append(grid1.best_score_)
print('Best Score: %s' % grid1.best_score_)
    # print best parameter after tuning
print(grid1.best_params_)
    # print how our model looks after hyper-parameter tuning
print(grid1.best_estimator_)

classifier = LogisticRegression()
param_grid = {
        'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 1e0],
        'penalty': ['l2'],
        'n_jobs': [-1],
        'multi_class': ['multinomial'],
        'solver': ['lbfgs']
    }
grid2 = GridSearchCV(classifier, param_grid, refit = True, verbose = 0)
    # fitting the model for grid search
grid2.fit(X_train, y_train)
score.append(grid2.best_score_)
print("For Logistic Regression classification")
print('Best Score: %s' % grid2.best_score_)
    # print best parameter after tuning
print(grid2.best_params_)
    # print how our model looks after hyper-parameter tuning
print(grid2.best_estimator_)

# classifier = RandomForestClassifier()
# param_grid = {
#         'min_samples_leaf': [1,3,5,10,30,45,50],
#         'max_features': ['sqrt', 'log2', 0.4, 0.5,],
#         'n_estimators': [100],
#         'n_jobs': [-1],
#         'random_state': [42]
#     }
# grid3 = GridSearchCV(classifier, param_grid, refit = True, verbose = 0)
#     # fitting the model for grid search
# grid3.fit(X_train, y_train)
# print("For Random forest classification")
# score.append(grid3.best_score_)
# print('Best Score: %s' % grid3.best_score_)
#     # print best parameter after tuning
# print(grid3.best_params_)
#     # print how our model looks after hyper-parameter tuning
# print(grid3.best_estimator_)

classifier = SVC() 
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001,"auto"],
              'kernel': ['rbf'],
              } 
  
grid4 = GridSearchCV(classifier, param_grid, refit = True, verbose = 0)
# fitting the model for grid search
grid4.fit(X_train, y_train)
print("For SVM classification")
score.append(grid4.best_score_)
print('Best Score: %s' % grid4.best_score_)
# print best parameter after tuning
print(grid4.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid4.best_estimator_)



num = score.index(max(score))
if num == 1:
    y_pred = grid1.predict(X_test)
    print("For SDG classification")
    print(classification_report(y_test, y_pred))
    y_pred = grid1.predict(x_test)
elif num == 2:
    y_pred = grid2.predict(X_test)
    print("For Logistic Regression classification")
    print(classification_report(y_test, y_pred))
    y_pred = grid2.predict(x_test)
# elif num == 3:
#     y_pred = grid3.predict(X_test)
#     print(classification_report(y_test, y_pred))
#     y_pred = grid3.predict(x_test)
else:
    y_pred = grid4.predict(X_test)
    print("For SVM classification")
    print(classification_report(y_test, y_pred))
    y_pred = grid4.predict(x_test)
check = pd.DataFrame({'Text':test_data['Text'],'Date':test_data['Date'],'Sentiment':y_pred})
check.to_csv('check.csv')

For SDG classification
Best Score: 0.7903907372454009
{'alpha': 0.001, 'loss': 'modified_huber', 'n_jobs': -1, 'penalty': 'l2'}
SGDClassifier(alpha=0.001, loss='modified_huber', n_jobs=-1)
For Logistic Regression classification
Best Score: 0.781295132922031
{'C': 10, 'multi_class': 'multinomial', 'n_jobs': -1, 'penalty': 'l2', 'solver': 'lbfgs'}
LogisticRegression(C=10, multi_class='multinomial', n_jobs=-1)
For SVM classification
Best Score: 0.7856250762975275
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=10, gamma=0.1)
For SDG classification
              precision    recall  f1-score   support

          -1       0.79      0.81      0.80       140
           0       0.76      0.55      0.64       150
           1       0.79      0.89      0.83       288

    accuracy                           0.78       578
   macro avg       0.78      0.75      0.76       578
weighted avg       0.78      0.78      0.77       578



In [19]:
df1 = pd.read_csv('check.csv')
df1.drop(columns=['Unnamed: 0'], axis=1,  inplace=True)
df1.rename(columns={'0' : 'Sentiment'},inplace=True)
index_name = df1[df1['Sentiment'].isna()==True].index
df1.drop(index_name,inplace=True)
print(df1.info())
print(df.info())
# df.drop(columns=['Tok_text'], axis=1,  inplace=True)
fg = df.append(df1,ignore_index=True)
fg.to_csv('stock_data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       500 non-null    object
 1   Date       500 non-null    object
 2   Sentiment  500 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 11.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2387 entries, 0 to 2386
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       2387 non-null   object        
 1   Date       2387 non-null   datetime64[ns]
 2   Sentiment  2387 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 56.1+ KB
None


  fg = df.append(df1,ignore_index=True)


In [32]:
# test_data = pd.read_csv('deb1.csv')
# test_data['Date'] = pd.to_datetime(test_data['Date'])
# test_data= test_data.iloc[:,1:4]
# test_data.rename(columns = {'Headline':'Text','Target':'Sentiment'}, inplace = True)
# test_data['Text'] = test_data['Text'].astype(str)
# test_data =test_data.replace(r'^\s*$', np.nan, regex=True)
# test_data = test_data[['Text','Date']].merge(df, on=['Text','Date'], how='left')
# test_data = test_data[test_data['Text'].str.contains("\?")==False]
# test_data = test_data.loc[test_data['Sentiment'].isnull() == True][:100]
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       100 non-null    object        
 1   Date       100 non-null    datetime64[ns]
 2   Sentiment  0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 3.1+ KB


In [39]:
# test_data = pd.read_csv('soh1.csv')
# test_data['Date'] = pd.to_datetime(test_data['Date'])
# test_data= test_data.iloc[:,1:4]
# test_data.rename(columns = {'Headline':'Text','Target':'Sentiment'}, inplace = True)
# test_data['Text'] = test_data['Text'].astype(str)
# test_data =test_data.replace(r'^\s*$', np.nan, regex=True)
# test_data = test_data[['Text','Date']].merge(df, on=['Text','Date'], how='left')
# test_data = test_data[test_data['Text'].str.contains("\?")==False]
# test_data = test_data.loc[test_data['Sentiment'].isnull() == True]
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       4000 non-null   object        
 1   Date       4000 non-null   datetime64[ns]
 2   Sentiment  0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 125.0+ KB


In [34]:
# test_data = pd.read_csv('say1.csv')
# test_data['Date'] = pd.to_datetime(test_data['Date'])
# test_data= test_data.iloc[:,1:4]
# test_data.rename(columns = {'Headline':'Text','Target':'Sentiment'}, inplace = True)
# test_data['Text'] = test_data['Text'].astype(str)
# test_data =test_data.replace(r'^\s*$', np.nan, regex=True)
# test_data = test_data[['Text','Date']].merge(df, on=['Text','Date'], how='left')
# test_data = test_data[test_data['Text'].str.contains("\?")==False]
# test_data = test_data.loc[test_data['Sentiment'].isnull() == True][:100]
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       100 non-null    object        
 1   Date       100 non-null    datetime64[ns]
 2   Sentiment  0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 3.1+ KB


In [70]:
y_pred = grid.predict(x_test)
check = pd.DataFrame({'Text':test_data['Text'],'Date':test_data['Date'],'Sentiment':y_pred})
check.to_csv('check.csv')

Using CNN

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       100 non-null    object
 1   Date       100 non-null    object
 2   Sentiment  100 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2187 entries, 0 to 2186
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       2187 non-null   object        
 1   Date       2187 non-null   datetime64[ns]
 2   Sentiment  2187 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 51.4+ KB
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2387 entries, 0 to 2386
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       2387 non-null   object
 1   Date       2387 non-null   object
 2   Sentiment  2387 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 56.1+ KB


  fg = df.append(df1,ignore_index=True)


In [24]:
# df2 = test_data[['Text','Date']].merge(df1, on=['Text'], how='left')
# index_name = df2[df2['Sentiment'].isna()==True].index
# df2.drop(index_name,inplace=True)
# df2.head(30)

In [None]:
# pos_msg = df[df['Sentiment'] == 1]
# zeo_msg = df[df['Sentiment']==0]
# nrg_msg = df[df['Sentiment']==-1]
# pos_msg_text = " ".join(pos_msg.Tok_text.to_numpy().tolist())
# zeo_msg_text = " ".join(zeo_msg.Tok_text.to_numpy().tolist())
# nrg_msg_text = " ".join(nrg_msg.Tok_text.to_numpy().tolist())

# pos_msg_cloud = WordCloud(width =520, height =260, stopwords=STOPWORDS,max_font_size=50, background_color ="black", colormap='Blues').generate(pos_msg_text)
# plt.figure(figsize=(16,10))
# plt.imshow(pos_msg_cloud, interpolation='bilinear')
# plt.axis('off') # turn off axis
# plt.show()
# zeo_msg_cloud = WordCloud(width =520, height =260, stopwords=STOPWORDS,max_font_size=50, background_color ="black", colormap='Blues').generate(zeo_msg_text)
# plt.figure(figsize=(16,10))
# plt.imshow(zeo_msg_cloud, interpolation='bilinear')
# plt.axis('off') # turn off axis
# plt.show()
# nrg_msg_cloud = WordCloud(width =520, height =260, stopwords=STOPWORDS,max_font_size=50, background_color ="black", colormap='Blues').generate(nrg_msg_text)
# plt.figure(figsize=(16,10))
# plt.imshow(nrg_msg_cloud, interpolation='bilinear')
# plt.axis('off') # turn off axis
# plt.show()


In [None]:
# fix random seed for reproducibility
jj
tf.random.set_seed(7)

# load the dataset but only keep the top n words, zero the rest

top_words = 5000
#(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

X = []
y = df['Sentiment']
sentences = list(df['Text'])
for sen in sentences:
    X.append(preprocess_text(sen))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)


vocab_length = len(word_tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


embeddings_dictionary = dict()
glove_file = open('/content/drive/My Drive/Colab Notebooks/a2_glove.6B.100d.txt', encoding="utf8")#/content/drive/MyDrive/Colab Notebooks/a2_glove.6B.100d.txt

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()


embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

print(embedding_matrix.shape)

In [None]:
# Neural Network architecture

cnn_model = Sequential()

embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
cnn_model.add(embedding_layer)

cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(1, activation='sigmoid'))

In [None]:
# Model compiling

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(cnn_model.summary())

In [None]:
cnn_model_history = cnn_model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

In [None]:
score = cnn_model.evaluate(X_test, y_test, verbose=1)

In [None]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
def build_model(hp):
  model = keras.Sequential()
  model.add(keras.layers.Dense(
      hp.Choice('units', [8, 16, 32]),
      activation='relu'))
  model.add(keras.layers.Dense(1, activation='relu'))
  model.compile(loss='mse')
  return model

In [None]:
tuner = keras_tuner.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5)

In [None]:
tuner.search(X_train, y_train, epochs=5, validation_data=(X_test, y_test))
best_model = tuner.get_best_models()[0]

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
model.summary()

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.0002, 
    epsilon=1e-08, 
    clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
    )

In [None]:
model.fit(X_train, y_train, epochs=2, validation_data=(X_test, y_test))