# <b> Sentiment Analysis of a WhatsApp Chat </b>

#### Importing required libraries:

In [1]:
import re
import pandas as pd
import nltk

#### Function which checks if the date and time format of the message is valid:

In [2]:
def check_datetime(s):

    #pattern of data and time which will be used in the txt file
    pattern = '^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'

    #checking if the given format matches our standard format
    result = re.match(pattern, s)
    
    if result:
        return True
    return False

In [3]:
def check_contact(s):

    s = s.split(":")
    
    if len(s)==2:
        return True
    else:
        return False

#### Finding messages, using our defined functions to pre-process them correctly and returning the important information:

1. Date when message was sent
2. Time at which message was sent
3. Person who sent that message
4. Contents of the message

In [4]:
def extract_data_from_line(line):

    #splitting line and storing it in new variable splitline
    splitline = line.split(' - ')

    dateTime = splitline[0] #extracting first element which is date-time
    date, time = dateTime.split(", ") #seperating date and time

    #joining elements from the list starting from the second element, thus reconstructing the message part of the original string
    message = " ".join(splitline[1:]) 

    if check_contact(message):

        #if a contact is found, splits the message into contact and message content, and stores it
        splitmessage = message.split(": ")
        contact = splitmessage[0]
        message = " ".join(splitmessage[1:])
    else:
        contact = None
    
    return date, time, contact, message

#### Obtaining the data from the conversation and pre-processing it as required:

In [5]:
data = []
conversation = r"C:\Users\DELL\Downloads\WhatsApp Chat with Amit Sir Eng Coll.txt"

with open(conversation, encoding="utf-8") as fp:
    fp.readline()

    messageBuffer = []
    date, time, contact = None, None, None

    while True:
        line = fp.readline()

        if not line:
            break

        line = line.strip() #removing leading and trailing white spaces to avoid issues

        if check_datetime(line): #checking if the line contains a date-time stamp using our previously defined function

            if len(messageBuffer) > 0:
                data.append([date, time, contact, ' '.join(messageBuffer)])

            messageBuffer.clear()
            date, time, contact, message = extract_data_from_line(line)
            messageBuffer.append(message)
        
        else:
            messageBuffer.append(line)

#### Applying the sentiment intensity analyzer from the nltk package on each message of the conversation:

In [6]:
#converting data into a pandas dataframe to ease handling of data
df = pd.DataFrame(data, columns=["Date", 'Time', 'Contact', 'Message'])
df['Date'] = pd.to_datetime(df['Date'])

nltk.download('vader_lexicon')

#removing rows with missing values
data = df.dropna()

#removing rows wherever media was present and was omitted
data = data[data['Message'] != "<Media omitted>"]

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()

#using polarity_scores function from the SentimentIntensityAnalyzer to obtain positive, negative and neutral scores
data.loc[:, "Positive"] = data["Message"].apply(lambda x: sentiments.polarity_scores(x)["pos"])
data.loc[:, "Negative"] = data["Message"].apply(lambda x: sentiments.polarity_scores(x)["neg"])
data.loc[:, "Neutral"] = data["Message"].apply(lambda x: sentiments.polarity_scores(x)["neu"])

print(data.head(10))

         Date   Time            Contact  \
1  2021-02-22  19:52  Amit Sir Eng Coll   
2  2021-02-22  19:52       Ishan Pandit   
3  2021-02-22  19:55       Ishan Pandit   
4  2021-02-22  19:55  Amit Sir Eng Coll   
5  2021-02-22  19:55       Ishan Pandit   
6  2021-02-22  19:56  Amit Sir Eng Coll   
7  2021-02-22  19:56       Ishan Pandit   
8  2021-02-22  19:56       Ishan Pandit   
9  2021-02-22  19:57  Amit Sir Eng Coll   
10 2021-02-22  19:59       Ishan Pandit   

                                              Message  Positive  Negative  \
1                                                 Yes     1.000     0.000   
2   Matlab I would get full 4 marks for this right...     0.000     0.000   
3                     So where have I gone wrong sir?     0.000     0.383   
4   Standard diductions are there in writing skill...     0.000     0.000   
5                                    Oh okay okay sir     0.655     0.000   
6   Presentation, grammar, language all we be eval...     0.000 

  df['Date'] = pd.to_datetime(df['Date'])
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Summing up the scores for each message, calculating which score turns out to be the highest, and displaying the corresponding result!

In [7]:
x = sum(data["Positive"])
y = sum(data["Negative"])
z = sum(data["Neutral"])

print("Positive score is: ", x)
print("Negative score is: ", y)
print("Neutral score is: ", z)

def sentiment(a, b, c):
    if (a>b) and (a>c):
        print("\nYour chat is predominantly positive 😊 ")
    elif (b>a) and (b>c):
        print("\nYour chat is predominantly negative ☹️ ")
    else:
        print("\nYour chat is predominantly neutral 😐 ")
sentiment(x, y, z)

Positive score is:  6.944999999999999
Negative score is:  3.0539999999999994
Neutral score is:  32.0

Your chat is predominantly neutral 😐 


### Thus, we used the Sentiment Intensity Analyzer from the nltk (Natural Language Tool Kit) to check what was the predominant emotion/sentiment present in any given WhatsApp chat.

### We can copy paste the path to the txt file of any WhatsApp chat of our liking and run the program to check the major sentiment present in the chat!