In [1]:
import pandas as pd
import os

---

## If we need to combine files

In [22]:
topic1 = 'salvation'
topic2 = 'worship'

df1 = pd.read_csv(f'data/{topic1}.csv')
df1.head()

Unnamed: 0,text,date,label,label_name,id
0,Eternal Salvation\tIf Eternal Salvation is det...,2024-04-26,9,salvation,1783692672487608710
1,(Golden Harvest International School) — In a t...,2024-04-26,9,salvation,1783692668884734437
2,{{USERNAME}} {{USERNAME}} {{USERNAME}} {{USERN...,2024-04-26,9,salvation,1783692575506849828
3,"📕""The Lord is my strength and my song, and he ...",2024-04-26,9,salvation,1783692534318821696
4,{{USERNAME}} {{USERNAME}} {{USERNAME}} {{USERN...,2024-04-26,9,salvation,1783692517994594599


In [23]:
df2 = pd.read_csv(f'data/{topic2}.csv')
df2.head()

Unnamed: 0,text,date,label,label_name,id
0,{{USERNAME}} 6843107 🚮🎯🇷🇺🐥🇰🇭 All worship the r...,2024-04-26,10,worship,1783691054803476692
1,[Allkpop]\tUpvote + Stay for 2mins + Share\t1 ...,2024-04-26,10,worship,1783691052588863893
2,praise my body like the slut you are. {{URL}},2024-04-26,10,worship,1783691050676351079
3,Through the true devotion (satbhakti) taught b...,2024-04-26,10,worship,1783691049480904950
4,The sweetest of all sounds is praise.,2024-04-26,10,worship,1783691046943400296


In [24]:
concatenated_df = pd.concat([df1, df2], ignore_index=True)
concatenated_df = concatenated_df.drop_duplicates()
concatenated_df.shape

(3000, 5)

In [25]:
concatenated_df.head()

Unnamed: 0,text,date,label,label_name,id
0,Eternal Salvation\tIf Eternal Salvation is det...,2024-04-26,9,salvation,1783692672487608710
1,(Golden Harvest International School) — In a t...,2024-04-26,9,salvation,1783692668884734437
2,{{USERNAME}} {{USERNAME}} {{USERNAME}} {{USERN...,2024-04-26,9,salvation,1783692575506849828
3,"📕""The Lord is my strength and my song, and he ...",2024-04-26,9,salvation,1783692534318821696
4,{{USERNAME}} {{USERNAME}} {{USERNAME}} {{USERN...,2024-04-26,9,salvation,1783692517994594599


In [26]:
concatenated_df.to_csv('data/master.csv', mode='a', index=False, header=not os.path.exists('data/master.csv'))

In [27]:
pd.read_csv('data/master.csv').shape

(14997, 5)

---
# Remove highly offensive tweets

In [31]:
from transformers import pipeline

# Load the language model for text classification
classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment", device=0)

In [32]:
# Load your dataset into a DataFrame
df = pd.read_csv('data/master.csv')

# Function to classify text and return toxicity score
def classify_text(text):
    result = classifier(text)
    return result[0]['label'], result[0]['score']


In [None]:
# Apply the classification function to each row in the DataFrame
df['toxicity_label'], df['toxicity_score'] = zip(*df['text'].apply(classify_text))

df.to_csv('data/sentiment_master.csv')

In [35]:
filtered_df = df[df['toxicity_label'] != '5 stars']
print(filtered_df.columns)
print(filtered_df.toxicity_label.unique())
print(filtered_df.shape)

Index(['text', 'date', 'label', 'label_name', 'id', 'toxicity_label',
       'toxicity_score'],
      dtype='object')
['1 star' '3 stars' '4 stars' '2 stars']
(9070, 7)


In [36]:
filtered_df.drop(columns=['toxicity_label', 'toxicity_score'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['toxicity_label', 'toxicity_score'], inplace=True)


In [37]:
filtered_df

Unnamed: 0,text,date,label,label_name,id
0,{{USERNAME}} Fair enough! Honestly I can not i...,2024-04-26,2,bible,1783677542588727555
1,{{USERNAME}} {{USERNAME}} I hear ya. I wasn’t ...,2024-04-26,2,bible,1783677542005649587
3,{{USERNAME}} Holy reach #BUST,2024-04-26,2,bible,1783677527355174915
4,{{USERNAME}} I was out to dinner tonight and j...,2024-04-26,2,bible,1783677526901932435
6,{{USERNAME}} Holy shit,2024-04-26,2,bible,1783677526537064910
...,...,...,...,...,...
14991,{{USERNAME}} actually yer fuckin' dumb if you ...,2024-04-29,10,worship,1784994083045564674
14993,{{USERNAME}} Truly these people doesn't worshi...,2024-04-29,10,worship,1784994078301565166
14994,"{{USERNAME}} It is hard to watch, but it has b...",2024-04-29,10,worship,1784994075315409176
14995,{{USERNAME}} {{USERNAME}} Vitriol? And rudenes...,2024-04-29,10,worship,1784994074182758744


In [38]:
duplicate_rows = filtered_df[filtered_df.duplicated()]
df_no_duplicates = filtered_df.drop_duplicates()

print("Duplicate Rows:")
print(len(duplicate_rows))

print("\nDataFrame without Duplicates:")
print(len(df_no_duplicates))

Duplicate Rows:
0

DataFrame without Duplicates:
9070


In [39]:
df_no_duplicates.to_csv("data/cleaned_master.csv", index=False)

In [40]:
df = pd.read_csv("data/cleaned_master.csv")
df.head()

Unnamed: 0,text,date,label,label_name,id
0,{{USERNAME}} Fair enough! Honestly I can not i...,2024-04-26,2,bible,1783677542588727555
1,{{USERNAME}} {{USERNAME}} I hear ya. I wasn’t ...,2024-04-26,2,bible,1783677542005649587
2,{{USERNAME}} Holy reach #BUST,2024-04-26,2,bible,1783677527355174915
3,{{USERNAME}} I was out to dinner tonight and j...,2024-04-26,2,bible,1783677526901932435
4,{{USERNAME}} Holy shit,2024-04-26,2,bible,1783677526537064910


In [41]:
df.shape

(9070, 5)

In [42]:
df = df.drop_duplicates()
df.shape

(9070, 5)

In [43]:
df.label.unique()

array([ 2,  3,  4,  5,  6,  7,  1,  8,  9, 10], dtype=int64)

In [44]:
df.label_name.unique()

array(['bible', 'current_events', 'fellowship', 'Jerusalem', 'Jesus',
       'Kingdom', 'prayer', 'prophesy', 'salvation', 'worship'],
      dtype=object)

In [45]:
df.label.value_counts()

label
3     1299
5     1159
8      997
2      961
9      908
1      859
6      786
7      737
10     702
4      662
Name: count, dtype: int64