<font color="green">**Jupyter notebook for vectorizing news data using Word2Vec Models.**</font>

In [1]:
import gensim.models.keyedvectors as word2vec
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np

**Load Google's pretrained Word2Vec model**

In [2]:
# Load the pretrained GoogleNews vectors (binary word2vec format) into `model`.
# The path is written as a raw string: it contains backslashes followed by
# characters (\c, \P, \G) that are not valid escape sequences, which makes a
# normal string literal emit an "invalid escape sequence" warning. The
# resulting path value is unchanged.
model = word2vec.KeyedVectors.load_word2vec_format(r'D:\cmpe295a\Preprocessing\GoogleNews-vectors-negative300.bin', binary=True)

**Read processed news data from file.**

In [86]:
# Company whose headlines are vectorized in this run.
# Other values kept from earlier runs of this cell: "AMD", "Apple", "Disney".
company = "Tesla"

# Input and output paths are both <folder>\<prefix><company>.csv.
input_file = "".join(["D:\\cmpe295b\\news_data\\processed", company, ".csv"])
output_file = "".join(["D:\\cmpe295b\\news_data\\vector", company, ".csv"])

In [87]:
# Read the processed headlines from input_file. Despite its name, `amddata`
# holds the data of whichever company was selected in the configuration cell.
amddata = pd.read_csv(input_file)
# Show the first rows as a quick sanity check of the loaded columns.
amddata.head()

Unnamed: 0,date,news
0,2020-06-05,Tesla Elon Musk breakup Amazon tweet
1,2020-06-05,UPDATE 1Tesla Elon Musk breakup Amazon tweet
2,2020-06-04,Breakingviews Corona Capital ZoomInfo IPO U.S....
3,2020-06-04,Tesla Daily Has Joined Maven Coalition Of Inde...
4,2020-06-04,Germany rebuff gasoline auto lobby radical ele...


**!!! Do not run this cell as part of the normal flow. Run the following code only to create a file which has 2 new columns: one with the words found in Google's Word2Vec model and the other with the words which were not found.**

In [5]:
def wordFound(headline):
    """Return the words of ``headline`` that the Word2Vec model knows.

    Parameters
    ----------
    headline : str
        Headline text; it is split on whitespace.

    Returns
    -------
    list of str
        The words, in their original order, that have an entry in the
        module-level ``model``.
    """
    # A membership test replaces the former try / bare-except lookup. The bare
    # except hid every error (for example a NameError when `model` had not
    # been loaded) and silently returned an empty list in that case.
    return [word for word in headline.split() if word in model]

def wordNotFound(headline):
    """Return the words of ``headline`` that the Word2Vec model does not know.

    Parameters
    ----------
    headline : str
        Headline text; it is split on whitespace.

    Returns
    -------
    list of str
        The words, in their original order, that have no entry in the
        module-level ``model``.
    """
    # A membership test replaces the former try / bare-except lookup. With the
    # bare except, any error (for example a NameError when `model` had not
    # been loaded) caused every word to be reported as "not found".
    return [word for word in headline.split() if word not in model]
        
# Add one column listing the headline words present in the model and one
# listing the words that are missing from it.
amddata['wordsFound'] = amddata['news'].apply(wordFound)
amddata['wordsNotFound'] = amddata['news'].apply(wordNotFound)
# Save the annotated frame. The target file name is fixed (wordsAMD.csv)
# and does not depend on the `company` setting.
amddata.to_csv('D:\\cmpe295b\\news_data\\wordsAMD.csv',index=False)
amddata.head()

Unnamed: 0,date,news,wordsFound,wordsNotFound
0,2020-06-08,Nasdaq hit record high U.S. recession official,"[Nasdaq, hit, record, high, U.S., recession, o...",[]
1,2020-06-08,GRAPHICNasdaq hit record high U.S. recession o...,"[hit, record, high, U.S., recession, official]",[GRAPHICNasdaq]
2,2020-06-08,Trump say U.S. police disbanded,"[Trump, say, U.S., police, disbanded]",[]
3,2020-06-08,Swiss parliament pave way coronavirus tracing ...,"[Swiss, parliament, pave, way, coronavirus, tr...",[]
4,2020-06-08,Galeries Lafayette Champs Elysees store strugg...,"[Galeries, Lafayette, Champs, Elysees, store, ...",[$1]


**Generate vectors for each word and average them to create a final vector of the headline.**

In [88]:
def generateVector(news):
    """Average the Word2Vec vectors of the words in a headline.

    Parameters
    ----------
    news : str
        Headline text; it is split on whitespace.

    Returns
    -------
    numpy.ndarray or float
        Element-wise mean of the vectors of all words that have an entry in
        the module-level ``model``. If no word of the headline is known,
        ``numpy.nan`` is returned.
    """
    # Words without an entry in the model are skipped. The membership test
    # replaces a bare except that also hid unrelated errors such as a
    # NameError when `model` had not been loaded.
    vectors = [model[word] for word in news.split() if word in model]
    if not vectors:
        # np.mean([]) also evaluates to NaN but emits a RuntimeWarning.
        # Returning NaN explicitly keeps the value that the later dropna()
        # step removes, without the warning.
        return np.nan
    return np.mean(vectors, axis=0)

# Store the averaged headline vector of every row in a new 'vectors' column.
amddata['vectors'] = amddata['news'].apply(generateVector)
amddata.head()

Unnamed: 0,date,news,vectors
0,2020-06-05,Tesla Elon Musk breakup Amazon tweet,"[0.0863444, 0.026041666, -0.15262859, 0.107747..."
1,2020-06-05,UPDATE 1Tesla Elon Musk breakup Amazon tweet,"[0.03491211, 0.0073649087, -0.19230652, 0.0602..."
2,2020-06-04,Breakingviews Corona Capital ZoomInfo IPO U.S....,"[0.034851074, 0.042681012, -0.09793527, 0.0008..."
3,2020-06-04,Tesla Daily Has Joined Maven Coalition Of Inde...,"[0.026048928, -0.04020691, -0.05104065, -0.047..."
4,2020-06-04,Germany rebuff gasoline auto lobby radical ele...,"[0.12018585, 0.09814453, 0.054229736, 0.054176..."


**Drop NaN rows**

In [89]:
# Row count before removing incomplete rows.
print(len(amddata))
# Drop every row that contains a missing value in any column.
amddata = amddata.dropna()
# Row count afterwards; compare with the first number to see how many rows
# were removed.
print(len(amddata))
# Displays False when no missing value remains in the frame.
amddata.isnull().values.any()

556
556


False

**Create columns for each value in the vector**

In [90]:
def getVec(vector, index):
    """Return the element of ``vector`` stored at position ``index``."""
    component = vector[index]
    return component
# Copy components 0..299 of every headline vector into their own columns
# named vec0 ... vec299.
for i in range(300):
    amddata['vec' + str(i)] = amddata['vectors'].apply(getVec, index=i)

In [91]:
# Preview the frame now that the per-component vec* columns have been added.
amddata.head()

Unnamed: 0,date,news,vectors,vec0,vec1,vec2,vec3,vec4,vec5,vec6,...,vec290,vec291,vec292,vec293,vec294,vec295,vec296,vec297,vec298,vec299
0,2020-06-05,Tesla Elon Musk breakup Amazon tweet,"[0.0863444, 0.026041666, -0.15262859, 0.107747...",0.086344,0.026042,-0.152629,0.107747,-0.034897,0.001628,0.066604,...,0.171143,0.080729,-0.249146,0.002767,0.066788,-0.185465,0.079468,-0.00152,-0.076986,0.042053
1,2020-06-05,UPDATE 1Tesla Elon Musk breakup Amazon tweet,"[0.03491211, 0.0073649087, -0.19230652, 0.0602...",0.034912,0.007365,-0.192307,0.060221,0.030746,0.025513,0.005243,...,0.215983,0.002767,-0.265422,-0.024251,0.038793,-0.190918,0.082316,0.001898,0.012858,0.010579
2,2020-06-04,Breakingviews Corona Capital ZoomInfo IPO U.S....,"[0.034851074, 0.042681012, -0.09793527, 0.0008...",0.034851,0.042681,-0.097935,0.00082,-0.052979,-0.075108,0.175781,...,0.133333,-0.017665,-0.220594,0.111346,0.036307,0.087298,0.131138,0.093576,-0.010768,-0.00014
3,2020-06-04,Tesla Daily Has Joined Maven Coalition Of Inde...,"[0.026048928, -0.04020691, -0.05104065, -0.047...",0.026049,-0.040207,-0.051041,-0.047356,-0.034409,0.052124,0.055939,...,-0.007248,-0.028183,-0.133461,0.166817,-0.008614,-0.152161,-0.006435,-0.00226,0.043022,0.032959
4,2020-06-04,Germany rebuff gasoline auto lobby radical ele...,"[0.12018585, 0.09814453, 0.054229736, 0.054176...",0.120186,0.098145,0.05423,0.054176,-0.122589,-0.064209,-0.09137,...,-0.187469,-0.073517,-0.056267,-0.021881,-0.107574,0.00351,-0.020096,0.119736,0.119721,0.026733


**Write the vectors to file**

In [92]:
# Save the frame, including the 'vectors' column and the vec* columns, to
# output_file without writing the DataFrame index.
amddata.to_csv(output_file,index=False)