# **Task 1**

In [None]:
!pip install vaderSentiment

# **Importing The Libraries**

In [None]:
import pandas as pd
import numpy as np
import nltk
import io
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Reading the Dataset**

In [5]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset.csv',encoding = "ISO-8859-1")
data.head()

Unnamed: 0,Data,Date,Time,tweetcaption
0,tuesdayvibes,7/14/2020,7:00:21,Love it here vacation vibes amazing beautiful ...
1,realmeC11,7/14/2020,7:00:21,Best Camera Smartphone under 20k Please vote a...
2,KPSharmaOli,7/14/2020,7:00:21,shree Why should we have a problem with the pe...
3,RheaChakraborty,7/14/2020,7:00:21,Rhea Chakraborty s Heartbreaking Post On Susha...
4,Stop_Transfer_Sunita_Yadav,7/14/2020,7:00:21,We stand for Sunita Yadav Stop the Transfer Wh...


# **Cleaning the DATA**

In [6]:
Tweets = data["tweetcaption"]
lemmatizer = WordNetLemmatizer()

#Cleaning tweet function
def cleaning(sentence):
    sentence = re.sub('[^a-zA-Z]',' ',sentence)            #removing everything except the characters
    words = sentence.split()

    #Lemmatizing the words
    words = [lemmatizer.lemmatize(word) for word in  words if not word in stopwords.words('english')]
    sentence = ' '.join(words)
    return sentence

for i in range(len(Tweets)):
    Tweets[i] = cleaning(Tweets[i])
    

# **Using Vader to make the Scoring**

In [7]:
pos = []
neg = []
neu = []
compound = []

analyzer = SentimentIntensityAnalyzer()

#making the scores for each tweet
for i in range(len(Tweets)):
    scores = analyzer.polarity_scores(Tweets[i])
    pos.append(scores["pos"])
    neg.append(scores["neg"])
    neu.append(scores["neu"])
    compound.append(scores["compound"])

data["Compound Score"] = compound
data["Positivity"] = pos
data["Negativity"] = neg
data["Neutrality"] = neu

data.head()

Unnamed: 0,Data,Date,Time,tweetcaption,Compound Score,Positivity,Negativity,Neutrality
0,tuesdayvibes,7/14/2020,7:00:21,Love vacation vibe amazing beautiful cabo mexi...,0.979,0.191,0.054,0.755
1,realmeC11,7/14/2020,7:00:21,Best Camera Smartphone k Please vote help reac...,0.9313,0.134,0.031,0.835
2,KPSharmaOli,7/14/2020,7:00:21,shree Why problem people We problem Stupid Com...,0.5429,0.128,0.099,0.773
3,RheaChakraborty,7/14/2020,7:00:21,Rhea Chakraborty Heartbreaking Post On Sushant...,0.9259,0.196,0.13,0.675
4,Stop_Transfer_Sunita_Yadav,7/14/2020,7:00:21,We stand Sunita Yadav Stop Transfer Where woma...,0.1779,0.188,0.199,0.613


# ***Day wise Scoring***

 **Fixing the missing dates in the dataset**

In [8]:
#Since a lot of dates are missing ,i.e. only ###### are present on some dates
#Assuming the tweets are taken from all the days of the given 3 months
#A new date is assigned whenever time goes past 12 AM 

date = data["Date"]
time = data["Time"]

for i in range(len(date)):
    current_date = date[i]
    current_time = time[i]

    #Slash found means date is there in the dataset
    if(current_date[1] == '/'):
        continue
    
    #else there hash tags in the date column
    else:
        prev_time = time[i-1]         
        prev_month = date[i-1][0]                            #stores the month number of previous data 
        prev_day = date[i-1][2:4]                            #stores the date number of previous data
        if(current_time[0] == '0' and prev_time == '23' ):   #this means time has passed 12AM , therefore changing the date
            if(prev_day == "31"):                            #if prev_day = 31, means a change of month is required
                month = int(prev_month)+1
                date[i] = "{}/01/2020".format(month)         # 1st date of the next month is assigned

            else:
                month = int(prev_month)                      
                day = int(prev_day) + 1                      # only the day number is changed and month remains same
                date[i] = "{}/{}/2020".format(month,day)     

data["Date"] = date                                          # changing the "Date" Column with the updated dates

data.to_csv("/content/drive/MyDrive/Colab Notebooks/dataset_modified.csv",encoding = "ISO-8859-1")





# **Making Day-Wise Average Scores**

In [12]:
date = data["Date"]

#Initializing the variables
n = 1
new_date = []
day_pos = []
day_neg = []
day_neu = []
day_compound = []
temp_pos = pos[0]
temp_neg = neg[0]
temp_neu = neu[0]
temp_compound = compound[0]
new_date.append(date[0])

In [13]:

for i in range(1,len(date)):

    #for same date, add all the scores
    if(date[i] == date[i-1]):
        n = n+1                                         # keeps a count of the number of hashtags in a day
        temp_pos = temp_pos + pos[i]
        temp_neg = temp_neg + neg[i]
        temp_neu = temp_neu + neu[i]
        temp_compound = temp_compound + compound[i]

    else:
        #assigning the average scores of a day 
        day_pos.append(temp_pos/n)
        day_neg.append(temp_neg/n)
        day_neu.append(temp_neu/n)
        day_compound.append(temp_compound/n)
        
        #re-initializing the temporary variables
        temp_pos = pos[i]
        temp_neg = neg[i]
        temp_neu = neu[i]
        temp_compound = compound[i]
        n = 1
        new_date.append(date[i])

#the last date would have not been appended due to the boundary conditions
#so appending the last date average scores outside the loop
day_pos.append(temp_pos/n)
day_neg.append(temp_neg/n)
day_neu.append(temp_neu/n)
day_compound.append(temp_compound/n)

#Creating a dataframe of the day_wise_scores
df = {"Date":new_date, 
      "Compound Score":day_compound,
      "Positivity":day_pos,
      "Negativity":day_neg,
      "Neutrality":day_neu}

day_wise_data = pd.DataFrame(df)

print(day_wise_data.shape)
print(day_wise_data)


(56, 5)
         Date  Compound Score  Positivity  Negativity  Neutrality
0   7/14/2020        0.516471    0.191511    0.089447    0.719025
1   7/15/2020        0.563632    0.189055    0.079768    0.731176
2   7/16/2020        0.512249    0.196261    0.081423    0.722308
3   7/17/2020        0.533501    0.203100    0.082841    0.714035
4   7/18/2020        0.705279    0.208717    0.070612    0.720645
5   7/19/2020        0.546228    0.207766    0.086291    0.705926
6   7/20/2020        0.567151    0.211255    0.082932    0.705839
7   7/21/2020        0.388422    0.179284    0.100251    0.720498
8   7/22/2020        0.600390    0.211059    0.078552    0.710315
9   7/23/2020        0.673989    0.216112    0.071441    0.712408
10  7/24/2020        0.546054    0.211937    0.083701    0.704482
11  7/25/2020        0.588452    0.209074    0.081371    0.709559
12  7/26/2020        0.612271    0.206854    0.081801    0.711258
13  7/27/2020        0.709165    0.210640    0.071154    0.718209
14