In [25]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from time import time
import pandas as pd
import nltk


In [26]:
# Load the headlines CSV; the path is relative to the notebook's working directory.
fin_data_df = pd.read_csv("./datasets/final_data.csv")
# Preview the first rows to check the columns that were read.
fin_data_df.head()

Unnamed: 0,Headlines,sentiment,Unnamed: 2,Unnamed: 3,ds_score,sentiment_label
0,tiktok considers london locations headquarters,,,,0.0,Neutral
1,disney cuts ad spending facebook amid growing ...,,,,-0.4215,Negative
2,trail missing wirecard executive leads belarus...,,,,-0.296,Negative
3,twitter says attackers downloaded data eight n...,,,,-0.5719,Negative
4,us republicans seek liability protections coro...,,,,-0.6124,Negative


In [27]:
# (rows, columns) of the loaded frame.
fin_data_df.shape

(58516, 6)

In [28]:
# Column dtypes and non-null counts; used to spot sparsely populated columns.
fin_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58516 entries, 0 to 58515
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Headlines        58515 non-null  object 
 1   sentiment        4846 non-null   object 
 2   Unnamed: 2       9 non-null      object 
 3   Unnamed: 3       2 non-null      object 
 4   ds_score         58516 non-null  float64
 5   sentiment_label  58516 non-null  object 
dtypes: float64(1), object(5)
memory usage: 2.7+ MB


In [29]:
# Summary statistics for the numeric column(s).
fin_data_df.describe()

Unnamed: 0,ds_score
count,58516.0
mean,-0.009468
std,0.350087
min,-0.9432
25%,-0.25
50%,0.0
75%,0.2023
max,0.946


In [30]:
# Count populated (True) vs. missing (False) entries in "Unnamed: 2".
fin_data_df["Unnamed: 2"].notna().value_counts()

Unnamed: 2
False    58507
True         9
Name: count, dtype: int64

In [31]:
# Count populated (True) vs. missing (False) entries in "Unnamed: 3".
fin_data_df["Unnamed: 3"].notna().value_counts()

Unnamed: 3
False    58514
True         2
Name: count, dtype: int64

In [32]:
# Count populated (True) vs. missing (False) entries in "sentiment".
fin_data_df["sentiment"].notna().value_counts()

sentiment
False    53670
True      4846
Name: count, dtype: int64

## These three columns are almost entirely null (at most 4,846 of 58,516 rows are populated), so we drop them


In [33]:
# Drop the three sparsely populated columns.
# The frame is rebound instead of mutated with inplace=True, and errors="ignore" skips
# columns that are already gone, so re-running this cell no longer raises KeyError.
fin_data_df = fin_data_df.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "sentiment"],
    errors="ignore",
)

fin_data_df.head()

Unnamed: 0,Headlines,ds_score,sentiment_label
0,tiktok considers london locations headquarters,0.0,Neutral
1,disney cuts ad spending facebook amid growing ...,-0.4215,Negative
2,trail missing wirecard executive leads belarus...,-0.296,Negative
3,twitter says attackers downloaded data eight n...,-0.5719,Negative
4,us republicans seek liability protections coro...,-0.6124,Negative


## Preprocessing the text is a crucial step here.
#### Techniques used:
- **Lowercase** - Convert the text to lower case, because token matching is case sensitive: without this step "Market" and "market" would be treated as two different words.

- **Remove punctuation** - Punctuation does not add value to the data. When a punctuation mark is attached to a word (for example "profit," vs. "profit"), the two forms are treated as different tokens, so we strip punctuation out.

- **Remove stopwords** - Stopwords include: I, he, she, and, but, was, were, being, have, etc. They carry little meaning on their own, so removing them reduces the number of features in our data. They are removed after the text has been split into words.

- **Stemming** - A technique that reduces a word to its root form by simply stripping suffixes. The stemmed word might not be part of the dictionary, i.e., it is not necessarily a meaningful word on its own.

- **Lemmatizing** - Reduces a word to its dictionary form, called the lemma. It is applied to nouns by default. It is more accurate than stemming because it uses a more informed analysis to group words with similar meanings, so it is more complex and takes more time. It is used where we need the output to remain meaningful words and retain contextual information.

In [None]:
# One-time setup: uncomment and run once per environment to fetch the NLTK data used below.
# 'wordnet' is the resource behind WordNetLemmatizer.
# NOTE(review): this notebook also calls stopwords.words('english') and nltk.word_tokenize,
# which presumably need the 'stopwords' and 'punkt' resources as well - confirm and add
# the matching nltk.download(...) calls here.
#nltk.download('wordnet')

In [44]:
#import stopwords and text processing libraries
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [47]:
# Create a function for preprocessing 

# NLTK helpers are built lazily once and shared by every preprocess_text call, instead of
# re-reading the stop-word list and re-creating the stemmer/lemmatizer for each headline.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resource(name, factory):
    """Return the cached resource called `name`, building it with `factory` on first use."""
    if name not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES[name] = factory()
    return _PREPROCESS_RESOURCES[name]


def preprocess_text(text, stemming=True, lemmatizing=True):
    """Normalise one piece of text into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, drop every character that is neither alphanumeric
    nor whitespace, split on whitespace, remove English stop words, then optionally
    stem and/or lemmatize each remaining word.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float ``NaN`` are treated as missing and yield ``''``;
        any other non-string value is converted with ``str()`` first.
    stemming : bool, default True
        Apply ``PorterStemmer`` to each word.
    lemmatizing : bool, default True
        Apply ``WordNetLemmatizer`` to each word. When both flags are set, the
        lemmatizer runs on the already-stemmed words.

    Returns
    -------
    str
        The processed words joined by single spaces (``''`` if nothing remains).
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float with x != x.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert text to lowercase
    text = str(text).lower()

    # Remove punctuation (keeps letters, digits and whitespace only)
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Remove stopwords
    stop_words = _get_preprocess_resource(
        'stop_words', lambda: frozenset(stopwords.words('english'))
    )
    words = [word for word in text.split() if word not in stop_words]

    # Apply stemming if specified
    if stemming:
        stemmer = _get_preprocess_resource('stemmer', PorterStemmer)
        words = [stemmer.stem(word) for word in words]

    # Apply lemmatizing if specified
    if lemmatizing:
        lemmatizer = _get_preprocess_resource('lemmatizer', WordNetLemmatizer)
        words = [lemmatizer.lemmatize(word) for word in words]

    # Join the preprocessed words back into a sentence
    return ' '.join(words)

In [49]:
# Replace missing headlines with '' so preprocess_text always receives a string, then
# overwrite the column with the cleaned text.
# NOTE(review): "Headlines" is overwritten in place, so re-running this cell feeds the
# already-processed text through preprocess_text again.
fin_data_df["Headlines"] = fin_data_df["Headlines"].fillna('').apply(preprocess_text)
fin_data_df.head()

Unnamed: 0,Headlines,ds_score,sentiment_label
0,tiktok consid london locat headquart,0.0,Neutral
1,disney cut ad spend facebook amid grow boycott...,-0.4215,Negative
2,trail miss wirecard execut lead belaru der spi...,-0.296,Negative
3,twitter say attack download data eight nonveri...,-0.5719,Negative
4,u republican seek liabil protect coronaviru ai...,-0.6124,Negative


In [50]:
# Pull the processed headlines out of the frame as an array of strings.
sentences_corpus = fin_data_df["Headlines"].values
sentences_corpus

array(['tiktok consid london locat headquart',
       'disney cut ad spend facebook amid grow boycott wsj',
       'trail miss wirecard execut lead belaru der spiegel report', ...,
       'track dan loeb third point portfolio q3 2018 updat',
       'track william von mueffl cantillon capit manag portfolio q3 2018 updat',
       'agil launch new water qualiti analyz'], dtype=object)

In [51]:
# Peek at the first five processed headlines.
sentences_corpus[:5]

array(['tiktok consid london locat headquart',
       'disney cut ad spend facebook amid grow boycott wsj',
       'trail miss wirecard execut lead belaru der spiegel report',
       'twitter say attack download data eight nonverifi account',
       'u republican seek liabil protect coronaviru aid battl loom'],
      dtype=object)

In [52]:
# Split every processed headline into a list of tokens; the result (a list of token
# lists) is what the Word2Vec cells below consume.
sent_tokenized = [nltk.word_tokenize(sent) for sent in sentences_corpus]
# Show the tokens of the first headline as a sanity check.
sent_tokenized[0]

['tiktok', 'consid', 'london', 'locat', 'headquart']

In [88]:
# Size in bytes of the outer list object only; the inner token lists and their strings
# are not included in this figure.
sent_tokenized.__sizeof__()

499944

### Training the model

In [69]:
# Load the pretrained vectors from the word2vec binary file in the working directory.
# KeyedVectors is the same class the notebook imports from gensim.models at the top.
google_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [89]:
# Create an untrained Word2Vec model with 300-dimensional vectors; min_count=1 keeps
# every token, including those that occur only once.
fin_w2v_model = Word2Vec(vector_size=300, min_count=1)
# Scan the tokenized headlines to build the vocabulary, logging progress every 1000 sentences.
fin_w2v_model.build_vocab(sent_tokenized , progress_per=1000)
# NOTE(review): total_examples is not read by any later cell shown here; the training
# cell reads fin_w2v_model.corpus_count directly.
total_examples = fin_w2v_model.corpus_count

In [77]:
# Warm-start from the pretrained GoogleNews vectors, copying them row by row for tokens
# that exist in both vocabularies. Assigning google_model.vectors wholesale would swap in
# a matrix whose rows follow Google's vocabulary order and size rather than this model's
# key_to_index, so every token would end up paired with an unrelated vector.
# NOTE(review): the corpus is stemmed, so many tokens may have no GoogleNews entry and
# keep their random initialisation - check the overlap count displayed below.
shared_tokens = [
    token for token in fin_w2v_model.wv.key_to_index
    if token in google_model.key_to_index
]
for token in shared_tokens:
    row = fin_w2v_model.wv.key_to_index[token]
    fin_w2v_model.wv.vectors[row] = google_model.get_vector(token)

# (tokens initialised from GoogleNews, vocabulary size)
len(shared_tokens), len(fin_w2v_model.wv.key_to_index)

In [90]:
# Train on the tokenized headlines for 10 epochs; total_examples is the sentence count
# read from the model (presumably recorded by build_vocab - confirm).
fin_w2v_model.train(sent_tokenized , total_examples=fin_w2v_model.corpus_count , epochs=10)

(4863988, 5084500)

In [91]:
# Inspect the cbow_mean setting of the trained model.
fin_w2v_model.cbow_mean

1

In [92]:
# Persist the full trained model to the working directory.
# NOTE(review): Word2Vec.save presumably writes gensim's own format rather than the
# word2vec binary format the '.bin' suffix suggests - confirm, and reload with
# Word2Vec.load rather than KeyedVectors.load_word2vec_format.
fin_w2v_model.save('fin_w2v_model.bin')