References:

https://www.analyticsvidhya.com/blog/2018/07/hands-on-sentiment-analysis-dataset-python/

https://machinelearningmastery.com/clean-text-machine-learning-python/



In [0]:
# !pip install pandas
# !pip install numpy
# !pip install nltk
# !pip install gensim
# !pip install wordcloud
# !pip install matplotlib

In [0]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from wordcloud import WordCloud
import matplotlib.pyplot as plt



In [0]:
# Colab only: mount the user's Google Drive so project files are reachable
# under /content/gdrive (this triggers the interactive authorisation prompt).
from google.colab import drive
drive.mount('/content/gdrive')
# Data folder: the input CSVs are read from here and the output pickles are
# written back to it by later cells.
directory='/content/gdrive/My Drive/Masters/DeepLearning/Project/Data'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably this is what word_tokenize (used in cleanArticles)
# relies on - confirm the resource name for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Fetch the stop-word corpus that cleanArticles reads through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
def cleanArticles(data):
  """Tokenise, stem and stop-word-filter every article text in ``data``.

  Parameters
  ----------
  data : pandas.Series of str
      Raw article texts (for example news headlines).

  Returns
  -------
  list of list of str
      One list of Porter-stemmed tokens per input text, in input order.
      English stop words and tokens shorter than three characters are
      dropped after stemming.
  """
  # regex=True is spelled out on purpose: the default of Series.str.replace
  # became a literal match in pandas 2.0, in which case the character class
  # below would be searched for verbatim and nothing would be stripped.
  data = data.str.replace(r"[^a-zA-Z#]", " ", regex=True)

  # Loop invariants: build the stemmer, the stop-word lookup (a set, for
  # O(1) membership tests) and the punctuation table once, not per article.
  porter = PorterStemmer()
  stop_words = set(stopwords.words('english'))
  # '#' is kept by the regex above, so punctuation is removed separately.
  strip_punctuation = str.maketrans('', '', string.punctuation)

  sentences = []
  for text in data:
    tokens = word_tokenize(text.translate(strip_punctuation))
    # Stem only purely alphabetic tokens.  The original computed this
    # filtered list but then stemmed the unfiltered tokens instead.
    stemmed = [porter.stem(word) for word in tokens if word.isalpha()]
    sentences.append([w for w in stemmed if w not in stop_words and len(w) > 2])
  return sentences

In [0]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute with the pattern itself.  Feeding every matched string back
    # into re.sub (as was done before) re-interprets the matched *text* as a
    # regex, which mis-handles metacharacters and lets a short match such as
    # "@ab" eat the front of a longer one such as "@abc".
    return re.sub(pattern, '', input_txt)

def cleanTweets(data):
  """Normalise the tweets stored in ``data['text']``.

  Each tweet has its ``@handle`` mentions removed, every character that is
  not an ASCII letter or ``#`` replaced by a space, words of three
  characters or fewer dropped, and the remaining words Porter-stemmed.

  Parameters
  ----------
  data : pandas.DataFrame
      Must provide a ``'text'`` column of strings.

  Returns
  -------
  pandas.Series of str
      One cleaned, space-joined string per tweet, in input order, on a
      fresh 0..n-1 index.
  """
  stemmer = PorterStemmer()

  def _stem_long_words(text):
    # Keep only words longer than three characters, then stem them.
    return ' '.join(stemmer.stem(word) for word in text.split() if len(word) > 3)

  # remove twitter handles (@user).  A plain list is used rather than
  # np.vectorize, which raises on empty input; the pattern is a raw string
  # so that \w is not parsed as a string escape.
  combi = pd.Series(
      [remove_pattern(text, r"@[\w]*") for text in data['text']],
      dtype=object,
  )
  # remove special characters, numbers, punctuations.  regex=True is explicit
  # because the default of Series.str.replace is a literal match from pandas 2.0.
  combi = combi.str.replace(r"[^a-zA-Z#]", " ", regex=True)
  return combi.apply(_stem_long_words)

In [0]:
def wordCloud(data):
  """Draw a word cloud of all texts in ``data``.

  ``data`` is an iterable of strings; they are joined with single spaces
  into one corpus, rendered with WordCloud and shown with matplotlib.
  Nothing is returned.
  """
  corpus = ' '.join(data)
  cloud = WordCloud(
      width=800,
      height=500,
      random_state=21,
      max_font_size=110,
  ).generate(corpus)

  # Show the rendered image on a 10x7 figure without axis ticks.
  plt.figure(figsize=(10, 7))
  plt.imshow(cloud, interpolation="bilinear")
  plt.axis('off')
  plt.show()

In [0]:
# Load the four raw inputs from the shared data folder.
comb_df = pd.read_csv(f"{directory}/Combined_News_DJIA.csv")
stock_df = pd.read_csv(f"{directory}/DJIA_table.csv")
reddit_df = pd.read_csv(f"{directory}/RedditNews.csv")
# The tweet dump is decoded with the 'unicode_escape' codec.
demo_df = pd.read_csv(
    f"{directory}/demonetization-tweets.csv",
    encoding='unicode_escape',
)

In [0]:
# Parse the Date column into datetimes so that the sort in the next cell is
# chronological.
stock_df['Date'] = pd.to_datetime(stock_df['Date'])

In [0]:
# Order the prices oldest-first and renumber the rows from 0, so that the
# label-based lookups stock_df['Open'][i] further down address row i.
# reset_index() keeps the previous index as an extra 'index' column.
stock_df = stock_df.sort_values(by='Date', ascending=True).reset_index()

In [0]:
# Absolute day-over-day move of the opening price, in percent.  The first
# row has no previous day, so it gets a 0 placeholder.
open_prices = stock_df['Open'].to_numpy()
percentage_change = [0] + list(100*np.abs(np.diff(open_prices))/open_prices[:-1])
median_change = np.median(percentage_change)
# Label a day 0 when its move is below the median move and 1 otherwise;
# the placeholder first day is labelled 0.
change = [0] + [int(not pct < median_change) for pct in percentage_change[1:]]
# Class balance: number of 0 labels, then number of 1 labels.
print(change.count(0), change.count(1), sep='\n')

994
995


In [0]:
# Attach the 0/1 labels from the `change` list as a new column (one per row).
stock_df['Change'] = change

In [0]:
# cleanTweets returns one space-joined string per tweet; split each back into
# a list of tokens.
# NOTE(review): split(" ") turns an empty cleaned tweet into [''] rather than
# an empty list - confirm that downstream consumers are fine with that.
demo_df['cleanText'] = cleanTweets(demo_df).str.split(" ")

In [0]:
# Store the list of stemmed tokens produced by cleanArticles for every
# headline in the News column.
reddit_df['cleanNews'] = cleanArticles(reddit_df['News'])

In [0]:
# Preview the first rows to eyeball the cleaned tokens next to the raw text.
reddit_df.head()

Unnamed: 0,Date,News,cleanNews
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,"[year, old, woman, mexico, citi, final, receiv..."
1,2016-07-01,IMF chief backs Athens as permanent Olympic host,"[imf, chief, back, athen, perman, olymp, host]"
2,2016-07-01,"The president of France says if Brexit won, so...","[presid, franc, say, brexit, donald, trump]"
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,"[british, man, must, give, polic, hour, notic,..."
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,"[nobel, laureat, urg, greenpeac, stop, oppos, ..."


In [0]:
# Turn the dates back into strings ahead of the merge on 'Date' in the next
# cell.
# NOTE(review): presumably reddit_df['Date'] is still the raw string column
# from the CSV at this point - verify that both sides use the same format.
stock_df['Date']=stock_df['Date'].astype(str)

In [0]:
# Attach the closing price and the 0/1 label of the same date to every news
# row.  No `how` is passed, so pandas' default join type applies; reddit_df is
# overwritten with the merged frame.
reddit_df = pd.merge(reddit_df, stock_df[['Date', 'Close', 'Change']], on='Date')

In [0]:
# Quick look at what the Porter stemmer does to a single word; the result is
# shown as the cell output.
stemmer = PorterStemmer()
stemmer.stem('employee')

'employe'

In [0]:
# Parse Date into datetimes so the following sort orders rows by date.
reddit_df['Date'] = pd.to_datetime(reddit_df['Date'])

In [0]:
# Newest news first.  The row index is left as it was, so it is no longer in
# ascending order after this step.
reddit_df = reddit_df.sort_values(by='Date',ascending=False)

In [0]:
# Preview the merged, date-sorted frame (now including Close and Change).
reddit_df.head()

Unnamed: 0,Date,News,cleanNews,Close,Change
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,"[year, old, woman, mexico, citi, final, receiv...",17949.369141,1
13,2016-07-01,"Iran celebrates American Human Rights Week, mo...","[iran, celebr, american, human, right, week, m...",17949.369141,1
1,2016-07-01,IMF chief backs Athens as permanent Olympic host,"[imf, chief, back, athen, perman, olymp, host]",17949.369141,1
24,2016-07-01,Ozone layer hole seems to be healing - US &amp...,"[ozon, layer, hole, seem, heal, amp, team, sho...",17949.369141,1
23,2016-07-01,A Hindu temple worker has been killed by three...,"[hindu, templ, worker, kill, three, men, motor...",17949.369141,1


In [0]:
# Number of news rows whose Change label is not 0.
(reddit_df['Change']!=0).sum()

24873

In [0]:
# Cast the retweet flag to int so it can serve as a numeric label, then
# preview it next to the cleaned tokens.
demo_df['isRetweet']=demo_df['isRetweet'].astype(int)
demo_df[['cleanText','isRetweet']].head()

Unnamed: 0,cleanText,isRetweet
0,"[critic, question, paytm, inform, about, #demo...",1
1,"[vote, #demonet, modi, survey]",1
2,"[former, finsec, governor, cbdt, chair, harvar...",1
3,"[gurugram, haryana, post, offic, employe, prov...",1
4,"[reddi, wed, cartoon, #demonet, #reddywed, htt...",1


In [0]:
# Persist only the token lists and their labels, one pickle per data set,
# into the same data folder the CSVs were read from.
reddit_df[['cleanNews','Change']].to_pickle(directory+'/reddit.pkl')
demo_df[['cleanText','isRetweet']].to_pickle(directory+'/demonetisation.pkl')

In [0]:
# Row counts of the two frames that were pickled above.
print(len(reddit_df))
print(len(demo_df))

49718
14940
