In [83]:
import pandas as pd
from scipy.stats import mode

# Clean Answer

In [84]:
# Read the filtered tweets, keep only the first 5000 rows, and save that
# subset to its own CSV file (without the row index).
data = pd.read_csv('tweets_filtered.csv').head(5000)
data.to_csv('tweets_filtered_top_5000.csv', index=False)

#### Gpt Informado

In [85]:
folder = 'raw_answers/'
file = 'gpt/gpt_informado.csv'

df = pd.read_csv(folder + file)

# Keep only rows whose ID is a plain number. Cast to str first: if pandas
# parses the column as integers, calling .isnumeric() on the values would
# raise AttributeError.
df['ID'] = df['ID'].astype(str)
df = df[df['ID'].str.isnumeric()]

# Keep the first answer recorded for each ID
df = df.groupby('ID').first().reset_index()

# IDs were strings for the filter above; convert so sorting is numeric
df['ID'] = df['ID'].astype(int)
df = df.sort_values('ID').reset_index(drop=True)

# Any answer containing "yes" becomes 1, everything else 0. The match is
# case-insensitive (same as the Llama cells) and missing answers count as 0
# instead of raising TypeError.
df['Sports Betting'] = (
    df['Sports Betting']
    .astype(str)
    .str.lower()
    .str.contains('yes', regex=False)
    .astype(int)
)

df = df.rename(columns={'Sports Betting': 'gpt_informado'})

df.to_csv('clean_answers/gpt/gpt_informado.csv', index=False)

#### Gpt Não-Informado

In [111]:
folder = 'raw_answers/'
file = 'gpt/gpt_nao_informado.csv'

df = pd.read_csv(folder + file)

df['ID'] = df['ID'].astype(str)

# Keep only rows whose ID is a plain number. The .copy() makes the filtered
# frame independent, so the column assignment below does not trigger
# SettingWithCopyWarning.
df = df[df['ID'].str.isnumeric()].copy()

# Any answer containing "yes" becomes 1, everything else 0. The match is
# case-insensitive and missing answers count as 0 instead of raising TypeError.
df['Mentions Sports Betting'] = (
    df['Mentions Sports Betting']
    .astype(str)
    .str.lower()
    .str.contains('yes', regex=False)
    .astype(int)
)

# NOTE(review): unlike the "Gpt Informado" cell, this cell does not keep one
# answer per ID, convert ID to int, rename the answer column, or write a
# cleaned CSV — TODO confirm whether that is intentional.


In [112]:
# Display the cleaned "Gpt Não-Informado" frame built in the previous cell
df

Unnamed: 0,ID,Tweet Content,Mentions Sports Betting
0,1,"""𝐈𝐭. 𝐂𝐚𝐧𝐧𝐨𝐭. 𝐆𝐞𝐭. 𝐁𝐢𝐠𝐠𝐞𝐫. 𝐓𝐡𝐚𝐧. 𝐓𝐡𝐢𝐬. 🔥\r\n\r\...",no
1,2,"""It’ll be a tough night for Europe today.\r\n\...",no
2,3,"""In defeat or in victory, always say Alhamduli...",no
3,4,"""FAFC Genesis Edition ( This collection have m...",no
4,5,"""Get ready for zabardast action on #25th Jan ....",no
...,...,...,...
4995,4996,"""If you wave Palestinian flags at the #FIFAWor...",no
4996,4997,"""FAFC Genesis Edition ( This collection have m...",no
4997,4998,"""@betcoza D is the Goal\r\n#BetcozaSpotTheGoal...",yes
4998,4999,"""France won the match against Morocco at #FIFA...",no


#### Llama3 Informado

In [87]:
folder = 'raw_answers/'
file = 'llama3/classificacoes_informadas_llama3.txt'

# Each line of the text file is one raw answer
with open(folder + file, 'r') as f:
    lines = list(f)

# Pair the answers with sequential IDs 1..5000 (the file must have exactly
# 5000 lines, otherwise the DataFrame constructor raises)
df = pd.DataFrame({'ID': range(1, 5001), 'classificacao': lines})

# Lower-case every answer, then encode it: 1 if it contains "yes", else 0
df['classificacao'] = df['classificacao'].str.lower()
df['classificacao'] = (
    df['classificacao'].str.contains('yes', regex=False).astype('int64')
)

df = df.rename(columns={'classificacao': 'llama_informado'})

df.to_csv('clean_answers/llama/classificacoes_informadas_llama3.csv', index=False)


#### Llama3 Não-Informado

In [88]:
folder = 'raw_answers/'
file = 'llama3/classificacoes_nao_informadas_llama3.txt'

# Each line of the text file is one raw answer
with open(folder + file, 'r') as f:
    lines = list(f)

# Pair the answers with sequential IDs 1..5000 (the file must have exactly
# 5000 lines, otherwise the DataFrame constructor raises)
df = pd.DataFrame({'ID': range(1, 5001), 'classificacao': lines})

# Lower-case every answer, then encode it: 1 if it contains "yes", else 0
df['classificacao'] = df['classificacao'].str.lower()
df['classificacao'] = (
    df['classificacao'].str.contains('yes', regex=False).astype('int64')
)

df = df.rename(columns={'classificacao': 'llama_nao_informado'})

df.to_csv('clean_answers/llama/classificacoes_nao_informadas_llama3.csv', index=False)


# Join Answers

#### Combine cleaned answers

In [93]:
def read_csv_files(csv_files, keep_id=False):
    """Load cleaned answer files and combine them side by side, matched on ID.

    Each file is read from ``clean_answers/`` and must contain an ``ID``
    column. Rows are aligned on the ``ID`` value rather than on row position,
    so a file that is missing some IDs cannot shift its answers onto the
    wrong tweet. Only IDs present in every file are kept (inner join).

    Args:
        csv_files: Iterable of file paths relative to ``clean_answers/``.
        keep_id: If True, keep ``ID`` as the first column of the result.
            Defaults to False, which returns only the answer columns.

    Returns:
        A DataFrame with a fresh 0..n-1 row index holding the non-ID columns
        of every file (plus ``ID`` when ``keep_id`` is True).

    Raises:
        KeyError: If a file has no ``ID`` column.
        ValueError: If ``csv_files`` is empty (raised by ``pd.concat``).
    """
    dfs = [
        pd.read_csv('clean_answers/' + file)
        # concat cannot align on an index with repeated labels; keep the
        # first answer per ID
        .drop_duplicates(subset='ID', keep='first')
        .set_index('ID')
        for file in csv_files
    ]
    combined_df = pd.concat(dfs, axis=1, join='inner')
    # ID is the index at this point: either restore it as a column or discard it
    return combined_df.reset_index(drop=not keep_id)

# Cleaned answer files to combine, given relative to clean_answers/
folder = 'clean_answers/'
csv_files = [
    'gpt/gpt_informado.csv',
    # 'gpt/gpt_nao_informado.csv',  # left out of the join
    'llama/classificacoes_informadas_llama3.csv',
    'llama/classificacoes_nao_informadas_llama3.csv',
]

final_df = read_csv_files(csv_files)

# Get stats from final vote

In [104]:
# NOTE(review): in the visible notebook `sample` is only assigned in a later
# cell (the one reading 'final_classified_tweets.csv'), so running the cells
# top to bottom on a fresh kernel raises NameError here — TODO move this
# display below that cell.
sample

Unnamed: 0,final_vote
1501,1.0
2586,1.0
2653,1.0
1055,1.0
705,1.0
...,...
4740,0.0
2940,1.0
3456,1.0
373,1.0


In [96]:
# Draw a reproducible (random_state=42) random sample of 100 rows from the
# final classification file.
# NOTE(review): no visible cell writes 'final_classified_tweets.csv' — confirm
# where it is produced.
sample = pd.read_csv('final_classified_tweets.csv').sample(100, random_state=42)

In [97]:
# Load the tweets from 'tweets_filtered.csv' (the original file, not the
# 'tweets_filtered_top_5000.csv' copy written earlier)
tweets = pd.read_csv('tweets_filtered.csv')

In [98]:
# Select the tweets whose row labels match the sampled rows and save them.
# Assumes row i of 'final_classified_tweets.csv' describes row i of
# 'tweets_filtered.csv' — TODO confirm.
tweets_samples = tweets.loc[sample.index]
tweets_samples.to_csv('tweets_samples.csv', index=False)

In [99]:
# Load the human classifications; later cells read its 'ID' and
# 'Tweet Content' columns
human_sample = pd.read_csv('human_classified.csv')

In [102]:
# Re-label rows as ID - 1 so they line up with the 0-based row labels of
# `sample` when the two frames are concatenated column-wise.
# Assumes IDs are 1-based row positions — TODO confirm.
human_sample.index = human_sample['ID'] - 1

In [60]:
# Compare the human sample with the final classified tweets.
# Any 'final_vote' column left from a previous run of this cell is dropped
# first, so re-running the cell does not append a duplicate column.
human_sample = pd.concat(
    [human_sample.drop(columns='final_vote', errors='ignore'), sample],
    axis=1,
)

# NOTE(review): 'Tweet Content' is used as the human 0/1 label here — confirm
# that this column really holds the label and not the tweet text.
y_true = human_sample['Tweet Content']
y_pred = human_sample['final_vote']

# Confusion-matrix counts; rows with a missing value on either side match no
# condition and are therefore not counted.
TP = int(((y_true == 1) & (y_pred == 1)).sum())
FP = int(((y_true == 0) & (y_pred == 1)).sum())
FN = int(((y_true == 1) & (y_pred == 0)).sum())
TN = int(((y_true == 0) & (y_pred == 0)).sum())

# Precision, recall and F1-score. An undefined ratio (zero denominator) is
# reported as 0.0 instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if (precision + recall) > 0
    else 0.0
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1_score}')

Precision: 0.0625
Recall: 1.0
F1-Score: 0.11764705882352941


In [52]:
# Display the human labels next to the sampled 'final_vote' values (the
# frame produced by the comparison cell above)
human_sample

Unnamed: 0,ID,Tweet Content,final_vote
1501,1502,0,1.0
2586,2587,0,1.0
2653,2654,0,1.0
1055,1056,0,1.0
705,706,0,1.0
...,...,...,...
4740,4741,0,0.0
2940,2941,0,1.0
3456,3457,0,1.0
373,374,0,1.0
