In [1]:
import pandas as pd
import re
import string
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Download the NLTK resources used below.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Shared helpers: a tweet-aware tokenizer and the set of English stop words.
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# One character class matching runs of code points from the emoji ranges below.
emoji_pattern = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags
    )) + "]+",
    flags=re.UNICODE,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
# Read the tab-separated tweet file into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer="new_tweets.tsv",
    sep='\t',
)
df

Unnamed: 0,tweet,class
0,"Violence only begets more violence, and no one...",1
1,quando bate a crise existencial,1
2,"Currently, the Party is forcing the implementa...",0
3,"One of the agents said, Hezbollah always maint...",0
4,"One will protect you, the others will try to k...",0
...,...,...
2625,Every time some Koranimal murders innocents th...,0
2626,@BarackObama can suck my dick #NiggerHitler,0
2627,My mother just got punched in the head by a br...,0
2628,N*glets destroy things as they go that's their...,0


In [44]:
def preprocess_tweet(tweet):
    """Clean a raw tweet into plain ASCII words separated by single spaces.

    Steps, in order: drop URLs (any run starting with ``http``), drop
    ``@mentions``, strip the ``#`` marker from hashtags (the tag word itself
    is kept), delete ASCII punctuation, delete digits, delete every non-ASCII
    character (this also removes emojis and accented letters), delete any
    remaining character that is neither a word character nor whitespace, and
    finally collapse whitespace. Letter case and stop words are left as-is.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. the NaN pandas uses
            for an empty cell) are treated as an empty tweet.

    Returns:
        str: The cleaned text with no leading/trailing whitespace and no
        repeated spaces; ``''`` when nothing is left.
    """
    # Missing cells arrive as NaN/None rather than text; re.sub would raise
    # TypeError on them, so treat them as an empty tweet.
    if not isinstance(tweet, str):
        return ''

    # Drop URLs first, before punctuation stripping can break them apart.
    tweet = re.sub(r'http\S+', '', tweet)
    # Drop @mentions entirely; for hashtags only the '#' marker is removed.
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = tweet.replace('#', '')
    # Delete ASCII punctuation.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Delete digits.
    tweet = re.sub(r'\d+', '', tweet)
    # Delete every non-ASCII character. All emoji code points are non-ASCII,
    # so this also strips emojis and no separate emoji pass is needed.
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # Delete anything left that is neither a word character nor whitespace.
    tweet = re.sub(r'[^\w\s]+', '', tweet)
    # The removals above leave gaps (e.g. a leading space where a mention
    # was); collapse every whitespace run to one space and trim the ends.
    return ' '.join(tweet.split())

In [45]:
# Clean every tweet in place. The text column is selected by name rather than
# by position so this keeps working if the TSV gains or reorders columns.
df['tweet'] = df['tweet'].apply(preprocess_tweet)
df

Unnamed: 0,tweet,class
0,Violence only begets more violence and no one ...,1
1,quando bate a crise existencial,1
2,Currently the Party is forcing the implementat...,0
3,One of the agents said Hezbollah always mainta...,0
4,One will protect you the others will try to ki...,0
...,...,...
2625,Every time some Koranimal murders innocents th...,0
2626,can suck my dick NiggerHitler,0
2627,My mother just got punched in the head by a br...,0
2628,Nglets destroy things as they go thats their p...,0


In [48]:
# Write the cleaned frame as a tab-separated file, omitting the index column.
df.to_csv(
    path_or_buf='Preprocessed_Tweets.tsv',
    index=False,
    sep='\t',
)

In [49]:
# Fixed seed so the train/dev/test partition (and the files written from it)
# is the same on every run.
RANDOM_STATE = 42

# Hold out 20% of the tweets as the test set, keeping label proportions equal.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['class'], random_state=RANDOM_STATE
)

# Carve the dev set out of the remaining 80%: 0.125 * 0.8 = 10% of all tweets,
# giving a 70/10/20 train/dev/test split, again stratified on the label.
train_df, dev_df = train_test_split(
    train_df, test_size=0.125, stratify=train_df['class'], random_state=RANDOM_STATE
)

# Print the size and label proportions of each of the three DataFrames.
for split_name, split_df in (('Train', train_df), ('Dev', dev_df), ('Test', test_df)):
    print(f'{split_name} DataFrame: {len(split_df)}, label proportions: \n{split_df["class"].value_counts(normalize=True)}')

Train DataFrame: 1841, label proportions: 
1    0.602933
0    0.397067
Name: class, dtype: float64
Dev DataFrame: 263, label proportions: 
1    0.604563
0    0.395437
Name: class, dtype: float64
Test DataFrame: 526, label proportions: 
1    0.602662
0    0.397338
Name: class, dtype: float64


In [50]:
# Write each split to its own tab-separated file, omitting the index column.
for split_frame, out_path in (
    (train_df, 'train.tsv'),
    (test_df, 'test.tsv'),
    (dev_df, 'dev.tsv'),
):
    split_frame.to_csv(out_path, sep='\t', index=False)