In [7]:
import pandas as pd
import re
import string
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

In [12]:
nltk.download('stopwords')
nltk.download('punkt')
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           "]+", flags=re.UNICODE)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
df = pd.read_csv("Tweets.tsv", sep='\t')
df

Unnamed: 0,tweet,class
0,Happy Asoka Jayanti to all who celebrate! Let'...,2
1,Theres national shock when police fired water ...,0
2,I just joined Standing For Women. One of the m...,0
3,What Hollywood tour includes this monument?\n#...,1
4,"Fellow Kenyans, we must remember that without ...",2
...,...,...
837,#Pakistan don't have red line . Terrorists blo...,0
838,A lot of the time people forget that enslaved ...,0
839,Where are the European chads who just leave lo...,0
840,“Mainstream media and the political class shru...,0


In [14]:
def preprocess_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove mentions and hashtags
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = tweet.replace('#', '')
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove emojis
    tweet = emoji_pattern.sub(r'', tweet)
    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    tweet = re.sub(r'[^\w\s]+', '', tweet)
    # Tokenize the tweet
    tokens = tokenizer.tokenize(tweet)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Join the tokens to form the preprocessed tweet
    preprocessed_tweet = ' '.join(tokens)
    return preprocessed_tweet

In [15]:
df['tweet'] = df.iloc[:, 0].apply(preprocess_tweet)
df

Unnamed: 0,tweet,class
0,Happy Asoka Jayanti celebrate Lets remember Em...,2
1,Theres national shock police fired water canno...,0
2,I joined Standing For Women One mratra violent...,0
3,What Hollywood tour includes monument guncultu...,1
4,Fellow Kenyans must remember without peace not...,2
...,...,...
837,Pakistan dont red line Terrorists blowing mosq...,0
838,A lot time people forget enslaved people Briti...,0
839,Where European chads leave lol I see European ...,0
840,Mainstream media political class shrugged rebe...,0


In [16]:
df.to_csv('Preprocessed_Tweets.tsv', sep='\t', index=False)

In [17]:
# Split the original DataFrame into train and test DataFrames, keeping equal proportions of labels
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['class'])

# Split the train DataFrame further into train and dev DataFrames, keeping equal proportions of labels
train_df, dev_df = train_test_split(train_df, test_size=0.125, stratify=train_df['class'])

# Print the sizes and label proportions of the three DataFrames
print(f'Train DataFrame: {len(train_df)}, label proportions: \n{train_df["class"].value_counts(normalize=True)}')
print(f'Dev DataFrame: {len(dev_df)}, label proportions: \n{dev_df["class"].value_counts(normalize=True)}')
print(f'Test DataFrame: {len(test_df)}, label proportions: \n{test_df["class"].value_counts(normalize=True)}')

Train DataFrame: 588, label proportions: 
0    0.510204
2    0.261905
1    0.227891
Name: class, dtype: float64
Dev DataFrame: 85, label proportions: 
0    0.517647
2    0.258824
1    0.223529
Name: class, dtype: float64
Test DataFrame: 169, label proportions: 
0    0.508876
2    0.266272
1    0.224852
Name: class, dtype: float64


In [19]:
train_df.to_csv('train.tsv', sep='\t', index=False)
test_df.to_csv('test.tsv', sep='\t', index=False)
dev_df.to_csv('dev.tsv', sep='\t', index=False)