In [4]:
# Create a bag of words for sentences in the file 'Corpus.txt'
# Filter .!? etc. from the sentences

import pandas as pd
import re

file = open('Corpus.txt', 'r')
text = file.read()

sentences = text.split('.')

corpus = {}

pattern = re.compile(r"([-\s.,;!?])+")

for i, sentence in enumerate(sentences):
    tokens = pattern.split(sentence)
    tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in tokens)

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print(df.shape)
df

(5, 63)


Unnamed: 0,Natural,language,processing,(NLP),is,a,subfield,of,linguistics,computer,...,categorize,organize,themselves,Challenges,frequently,involve,speech,recognition,understanding,generation
sent0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent1,0,1,0,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
sent3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,1
sent4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# calculate overlap of each sentence in the file 'Corpus.txt'

for index1, row1 in df.iterrows():
    for index2, row2 in df.iterrows():
        if index1 != index2:
            print(index1, 'vs', index2)
            print(row1.dot(row2))


(5, 48)
sent0 vs sent1
2
sent0 vs sent2
0
sent0 vs sent3
3
sent0 vs sent4
0
sent1 vs sent0
2
sent1 vs sent2
2
sent1 vs sent3
1
sent1 vs sent4
0
sent2 vs sent0
0
sent2 vs sent1
2
sent2 vs sent3
0
sent2 vs sent4
0
sent3 vs sent0
3
sent3 vs sent1
1
sent3 vs sent2
0
sent3 vs sent4
0
sent4 vs sent0
0
sent4 vs sent1
0
sent4 vs sent2
0
sent4 vs sent3
0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kacper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# do the same as in previous task, but now remove stop words from the sentences

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
    tokens = pattern.split(sentence)
    tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
    tokens = [x for x in tokens if x not in stop_words]
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in tokens)

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print(df.shape)
df

for index1, row1 in df.iterrows():
    for index2, row2 in df.iterrows():
        if index1 <= index2:
            print(index1, 'vs', index2)
            print(row1.dot(row2))




(5, 48)
sent0 vs sent0
22
sent0 vs sent1
2
sent0 vs sent2
0
sent0 vs sent3
3
sent0 vs sent4
0
sent1 vs sent1
12
sent1 vs sent2
2
sent1 vs sent3
1
sent1 vs sent4
0
sent2 vs sent2
11
sent2 vs sent3
0
sent2 vs sent4
0
sent3 vs sent3
10
sent3 vs sent4
0
sent4 vs sent4
0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kacper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# do as above, but using Porter stemmer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for i, sentence in enumerate(sentences):
    tokens = pattern.split(sentence)
    tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
    tokens = [x for x in tokens if x not in stop_words]
    tokens = [stemmer.stem(x) for x in tokens]
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in tokens)

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print(df.shape)
df

for index1, row1 in df.iterrows():
    for index2, row2 in df.iterrows():
        if index1 <= index2:
            print(index1, 'vs', index2)
            print(row1.dot(row2))

(5, 45)
sent0 vs sent0
19
sent0 vs sent1
2
sent0 vs sent2
0
sent0 vs sent3
3
sent0 vs sent4
0
sent1 vs sent1
12
sent1 vs sent2
2
sent1 vs sent3
1
sent1 vs sent4
0
sent2 vs sent2
11
sent2 vs sent3
0
sent2 vs sent4
0
sent3 vs sent3
10
sent3 vs sent4
0
sent4 vs sent4
0


In [29]:
# Read 'TweeterSentiments.txt' and use VADER to determine sentiments. Compare with sentiments in the file. You can for examole calculate average difference per sentiment.

# the second column contains the sentiment, the third column contains the tweet

import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

file = open('TweeterSentiments.txt', 'r')
text = file.read()

sa = SentimentIntensityAnalyzer()

# read the file into a dataframe
tweets = pd.read_csv('TweeterSentiments.txt', sep='\t', header=None, names=['id', 'sentiment', 'tweet'])

print(tweets.head())

vader_scores = [sa.polarity_scores(tweet) for tweet in tweets['tweet']]
tweets['vader_scores'] = vader_scores
tweets['diffeence'] = (tweets['sentiment']) - (tweets['sentiment'] - tweets['vader_scores']).abs()

print('averaged difference per sentiment')


  df.groupby('sentiment').mean()['diff']


sentiment
-0.010638298   -0.345438
-0.010752688    0.209447
-0.021052632   -0.967553
-0.021276596    0.400223
-0.030927835    0.639572
                  ...   
3.445652174     2.951752
3.45            3.028500
3.47311828      2.539018
3.489583333     2.621183
3.55            2.757500
Name: diff, Length: 968, dtype: float64