# Import libaries

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import string
! pip install emoji
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle
from scipy.sparse import hstack




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
! pip install ekphrasis
import ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
hashtag_segmenter = TextPreProcessor(segmenter="twitter", unpack_hashtags=True)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...


# Load Dataset

In [38]:
# Take a look at the data
file_name = "./CL-II-MisinformationData - Sheet1.csv"
df = pd.read_csv(file_name)
df.head(5)

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,Populous states can generate large case counts...,real


In [39]:
df['label'] = df['label'].map({'real': 1, 'fake': 0})
df.head(5)

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,1
1,States reported 1121 deaths a small rise from ...,1
2,Politically Correct Woman (Almost) Uses Pandem...,0
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,Populous states can generate large case counts...,1


In [40]:
label_counts = df['label'].value_counts()
print(label_counts)

1    5545
0    5055
Name: label, dtype: int64


In [41]:
punct_set = set(string.punctuation + '''…'"`’”“'''  + '️')
stoplist = set(nltk.corpus.stopwords.words('english'))

In [42]:
def emoji_split(e, joiner = '\u200d',
                variation_selector=b'\xef\xb8\x8f'.decode('utf-8'),
                return_special_chars = False):
  parts = []
  for part in e:
    if part == joiner:
      if return_special_chars:
        parts.append(":joiner:")
    elif part == variation_selector:
      if return_special_chars:
        parts.append(":variation:")
    else:
      parts.append(part)
  return parts

In [43]:
def data_preprocessor(text):

    # Remove &amp;
    text = text.replace('&amp;', ' ')

    # Remove newline characters
    text = text.replace('\n', ' ')

    # extra space
    text = re.sub('\s+',' ',text)

    # lowercase
    text = text.lower()

    # tokenize
    tt = nltk.tokenize.TweetTokenizer()
    tokens = tt.tokenize(text)

    # Remove stopwords
    tokens = [token for token in tokens if token not in stoplist ]

    # Remove punctuation
    tokens = [token for token in tokens if token not in punct_set]

    # Process tokens
    updated_tokens = []
    lemmatizer = WordNetLemmatizer()

    for t in tokens:
        # Split emoji into components
        if t in emoji.EMOJI_DATA:
            updated_tokens += emoji_split(t)
        # Keep original hashtags and split them into words
        elif t.startswith('#'):
            updated_tokens += hashtag_segmenter.pre_process_doc(t).split(" ")
            # ['<hashtag>']  + hashtag_segmenter.pre_process_doc(t).split(" ") + ['</hashtag>']
        # Replace user mentions with <user>
        elif t.startswith('@'):
            updated_tokens.append('<user>')
        # Replace URLs with <url>
        elif t.startswith('http'):
            updated_tokens.append('<url>')
        else:
            # print(lemmatizer.lemmatize(t))
            updated_tokens.append(lemmatizer.lemmatize(t))

    # de-emojize
    updated_tokens = [emoji.demojize(token) for token in updated_tokens ]

    # Lemmatization
    # lemmatizer = WordNetLemmatizer()
    # updated_tokens = [lemmatizer.lemmatize(token) for token in updated_tokens]

    # print(updated_tokens)
    text = ' '.join(updated_tokens)

    # Replace variations of "co[non-alphanumeric]vid[non-alphanumeric]19" with "covid19"
    text = re.sub(r'co[^\w]*vid', 'covid', text)
    text = re.sub(r'covid[ー^\w]*19', 'covid19', text)

    # clean text
    # text = re.sub(r'\b[^a-zA-Z\s.]+\b', '', text)

    return text

In [44]:
df['preprocessed_tweet'] = df['tweet'].apply(data_preprocessor)
df.head(5)

Unnamed: 0,tweet,label,preprocessed_tweet
0,The CDC currently reports 99031 deaths. In gen...,1,cdc currently report 99031 death general discr...
1,States reported 1121 deaths a small rise from ...,1,state reported 1121 death small rise last tues...
2,Politically Correct Woman (Almost) Uses Pandem...,0,politically correct woman almost us pandemic e...
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1,india fights corona 1524 covid testing laborat...
4,Populous states can generate large case counts...,1,populous state generate large case count look ...


# Task 1

In [45]:
from sklearn.model_selection import train_test_split
import pandas as pd

X = df['preprocessed_tweet']
y = df['label']

In [46]:
# Split the data into training (80%), validation (10%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True)

In [47]:
# Create DataFrames for each split
train_df = pd.DataFrame({'tweet': X_train, 'label': y_train})
val_df = pd.DataFrame({'tweet': X_val, 'label': y_val})
test_df = pd.DataFrame({'tweet': X_test, 'label': y_test})

In [48]:
# Save the DataFrames to CSV files
train_df.to_csv('train_split.csv', index=False)
val_df.to_csv('val_split.csv', index=False)
test_df.to_csv('test_split.csv', index=False)

# Task 3

In [49]:
train_df

Unnamed: 0,tweet,label
1794,india fights corona india report 90000 recover...,1
5020,continuing good news daily reported death cont...,1
8129,• 3 people currently hospital one auckland cit...,1
8616,15/03 ncdc directly contacted twitter user men...,1
3838,india confirmed case count cross ten lakh main...,0
...,...,...
5734,netflix series secret terrius foretold pandemic,0
5191,madagascar covid 19 free april 28 registering ...,0
5390,news smoker wait pub reopen spend night standi...,0
860,<user> <user> per goi covid19 case home isolat...,1


In [50]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_train_vectors = tfidf_vectorizer.fit_transform(train_df['tweet'])
tfidf_train_vectors_array = tfidf_train_vectors.toarray()

# Transform test and validation sets using the same vectorizer
tfidf_test_vectors = tfidf_vectorizer.transform(test_df['tweet'])
tfidf_val_vectors = tfidf_vectorizer.transform(val_df['tweet'])


In [51]:
# Save TF-IDF vectors with labels
with open('tfidf_train_vectors_with_labels.pickle', 'wb') as f:
    pickle.dump((tfidf_train_vectors, train_df['label']), f)

with open('tfidf_test_vectors_with_labels.pickle', 'wb') as f:
    pickle.dump((tfidf_test_vectors, test_df['label']), f)

with open('tfidf_val_vectors_with_labels.pickle', 'wb') as f:
    pickle.dump((tfidf_val_vectors, val_df['label']), f)

In [53]:
tfidf_val_vectors.shape

(1060, 13704)