# Task 1: Scraping Tweets

In [361]:
# Importing the necessary libraries
import snscrape.modules.twitter as twitter
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [362]:
# Scrape the tweets posted by the given username
def scrap_tweets(username, total=1000):
    """Collect the text of up to ``total`` tweets from ``username``.

    Items are taken from ``TwitterUserScraper(username).get_items()`` in the
    order they are yielded, and each item's ``content`` is stored as a ``str``.

    Returns a list of tweet texts.
    """
    user_scraper = twitter.TwitterUserScraper(username)
    last_wanted_position = total - 1
    contents = []
    for position, item in enumerate(user_scraper.get_items()):
        # Positions are zero-based: stop once we are past the last wanted one.
        if position > last_wanted_position:
            break
        contents.append(str(item.content))
    return contents

In [363]:
# Scraping the tweets for SouthamptonFC

username = "SouthamptonFC"  # account handle passed to the user scraper
total = 1000  # upper bound on the number of tweets to collect

# Tweet texts as plain strings, in the order the scraper yields them
all_tweets = scrap_tweets(username, total)

print("Total tweets:", len(all_tweets))

Total tweets: 1000


In [364]:
# Look at one raw tweet (index 3) before any cleaning is applied
all_tweets[3]

'💼 #BarclaysWC\n💼 #PLCup\n\nPreview the next two #SaintsFC clashes on the horizon:'

In [365]:
# converting to dataframe and saving as csv
# index=False keeps the row index out of the file; otherwise reading the CSV
# back produces an extra "Unnamed: 0" column next to "tweets".
data = {"tweets": all_tweets}
df = pd.DataFrame(data)
df.to_csv(username + '_task1.csv', index=False)

In [366]:
# Preview the first 10 raw tweets held in the dataframe
df.head(10)

Unnamed: 0,tweets
0,C’mon you Saints! 💪 https://t.co/nCWN7YSmP3
1,2️⃣ days to go... ⏳\n\nWatch the #ThreeLions f...
2,Who wins the #FIFAWorldCup Golden Boot? 🤔\n\n#...
3,💼 #BarclaysWC\n💼 #PLCup\n\nPreview the next tw...
4,🎤 Potential lead singer?\n🚗 Stu the car salesm...
5,@shinyford We’ll cross that bridge when we com...
6,Ready to represent 🅰️ 🇩🇪 https://t.co/uywH6mmlT6
7,@JimmyJayMorgan 🔥
8,Who wins the #FIFAWorldCup? \n\nWe let the #Sa...
9,@premierleague 💪


# Task 2: Data Cleaning and Preparation

In [408]:
# reading the stored csv
# The file name is rebuilt from the username exactly as in the Task 1 save step.
username = "SouthamptonFC"
df2 = pd.read_csv(username + '_task1.csv')

In [440]:
# Load the stop words that were provided with the previous assignment
with open("stop_words.txt", "r") as f:
    stop_words = f.readlines()

# Strip the line break and every punctuation character from each entry, so the
# entries match tweet words once the tweets have had punctuation removed too.
stop_words_list = [
    re.sub(r'[^\w\s]', '', raw_entry.replace("\n", ""))
    for raw_entry in stop_words
]

# Applying the pre-process

def pre_process(df, stop_words=None):
    """Clean raw tweet strings into lowercase, space-separated word strings.

    Parameters
    ----------
    df : iterable
        Raw tweets (for example a pandas Series of strings). Entries that are
        not ``str`` -- such as the NaN that ``read_csv`` produces for an empty
        cell -- are skipped instead of raising inside ``re.sub``.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words_list``
        is used.

    Returns
    -------
    list of str
        One cleaned string per tweet that still contains at least one word.
        Tweets that end up empty are dropped, so the result can be shorter
        than the input.
    """
    if stop_words is None:
        stop_words = stop_words_list
    stop_word_set = set(stop_words)  # constant-time membership checks

    cleaned_tweets = []
    for tweet in df:
        if not isinstance(tweet, str):
            continue

        # Line breaks become spaces (not empty strings) so the words on either
        # side of a break are not glued together into one token.
        text = tweet.replace("\n", " ")
        # Remove links, possessive 's, hashtags, mentions, digits,
        # punctuation/emoji and underscores.
        text = re.sub(r"http\S+|'s|#\S+|@\S+|[0-9]|[^\w\s]|_", "", text)
        # Remove stand-alone single ASCII letters.
        text = re.sub(r"\b[a-zA-Z]\b", "", text)
        text = text.lower()

        # Keep words longer than two characters that are not stop words.
        # split() with no argument also discards the empty tokens left behind
        # by the removals above.
        kept_words = [
            word
            for word in text.split()
            if len(word) > 2 and word not in stop_word_set
        ]
        if kept_words:
            cleaned_tweets.append(" ".join(kept_words))

    return cleaned_tweets

# Clean every raw tweet; the count printed below can be lower than the number
# of raw tweets because pre_process omits tweets that end up empty.
all_processed_tweets = pre_process(df2["tweets"])
print("Tweets after pre-processing:", len(all_processed_tweets))

Tweets after pre-processing: 865


In [441]:
# Look at one cleaned tweet (index 1 of the processed list)
all_processed_tweets[1]

'days watch comfort mary'

In [442]:
# converting to dataframe and saving as csv
# index=False keeps the row index out of the file; otherwise reading the CSV
# back produces an extra "Unnamed: 0" column next to "tweets".
data = {"tweets": all_processed_tweets}
df = pd.DataFrame(data)
df = df.dropna()
df.to_csv(username + '_task2.csv', index=False)

In [443]:
# Preview the first 10 cleaned tweets held in the dataframe
df.head(10)

Unnamed: 0,tweets
0,cmon saints
1,days watch comfort mary
2,wins golden boot women take pick
3,preview next two clashes horizon
4,potential lead singer stu car salesman future ...
5,well cross bridge come
6,ready represent
7,wins let players decide close
8,building future
9,get see mary soon ben


# Task 3: Feature Extraction - Bag of Words and Smoothing

In [444]:
# Splitting the tweets into train/test

# keep_default_na=False: a cleaned tweet that is literally "null", "nan", etc.
# must stay a string; the default NA parsing would turn it into a float NaN,
# which has no .split() and would break the vocabulary/bag-of-words steps.
df = pd.read_csv(username + '_task2.csv', keep_default_na=False)

# A fixed random_state makes the 80/20 split -- and therefore the vocabulary
# and the vectors built from it -- identical on every Restart & Run All.
train, test = train_test_split(df["tweets"], test_size=0.2, random_state=42)
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (692,)
Test shape: (173,)


In [446]:
# making vocabulary by getting unique words in training dataset
def make_vocabulary(df):
    """Return the unique words of all tweets, in order of first appearance.

    Parameters
    ----------
    df : iterable of str
        Tweets whose words are separated by single spaces.

    Returns
    -------
    list of str
        Each distinct token produced by ``tweet.split(" ")``, listed once.
    """
    # A dict keeps insertion order and gives constant-time duplicate checks,
    # avoiding a scan of the whole vocabulary list for every word.
    unique_words = dict.fromkeys(
        word
        for each_tweet in df
        for word in each_tweet.split(" ")
    )
    return list(unique_words)
    
# The vocabulary is built from the training tweets only
vocab = make_vocabulary(train)
print("Vocab:", len(vocab))

Vocab: 1245


In [452]:
# Creating bag of words for train and test

def make_bag_of_words(vocab, df):
    """Build one add-one-smoothed presence vector per tweet.

    Parameters
    ----------
    vocab : list of str
        Vocabulary words; expected to be unique (as produced by
        ``make_vocabulary``). Position ``k`` of every vector refers to
        ``vocab[k]``.
    df : iterable of str
        Tweets whose words are separated by single spaces.

    Returns
    -------
    list of list of int
        For each tweet, a list of ``len(vocab)`` values: 2 where the vocabulary
        word occurs in the tweet, 1 (the smoothing baseline) where it does not.

    NOTE(review): a word repeated inside one tweet still yields 2, i.e. the
    vector records presence rather than frequency -- confirm this is intended.
    """
    vectors = []
    for each_tweet in df:
        # A set gives constant-time lookups; enumerate supplies the position
        # directly instead of searching the vocabulary with vocab.index().
        tweet_words = set(each_tweet.split(" "))
        vector = [1] * len(vocab)  # every entry starts at 1 (smoothing)
        for position, vocab_word in enumerate(vocab):
            if vocab_word in tweet_words:
                vector[position] += 1
        vectors.append(vector)
    return vectors

# Both splits are encoded against the same training-derived vocabulary, so a
# test-only word has no position in the vectors.
bag_of_words_train = make_bag_of_words(vocab, train)
bag_of_words_test = make_bag_of_words(vocab, test)

In [462]:
# Show the first 11 training vectors. Slicing replaces eleven copy-pasted
# print calls and does not raise IndexError when fewer vectors exist.
for train_vector in bag_of_words_train[:11]:
    print(train_vector)

[2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [463]:
# Show the first 11 test vectors. Slicing replaces eleven copy-pasted
# print calls and does not raise IndexError when fewer vectors exist.
for test_vector in bag_of_words_test[:11]:
    print(test_vector)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 