In [48]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from unidecode import unidecode
from os import listdir
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [2]:
PATH = r'C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data'

In [3]:
df = pd.DataFrame()
for year in listdir(PATH):
    for f in listdir(PATH + '\\' + year):
        dataset_from_files_in_path = pd.read_csv(PATH + '\\' + year + '\\' + f)
        df = pd.concat([df, dataset_from_files_in_path])
    df.drop(columns='Unnamed: 0', inplace=True)

In [4]:
df.drop(columns='Unnamed: 0.1', inplace=True)

In [5]:
df

Unnamed: 0,user_id,conversation_id,date,time,tweet
0,1430944412,958912747621806080,2018-01-31,23:59:49,Insônia com ansiedade te faz gerar muitas para...
1,940021578624065537,958912695411052545,2018-01-31,23:59:36,"faltam 16 dias pro meu filho nascer, a ansieda..."
2,139421250,958912686120685573,2018-01-31,23:59:34,Frustração é sempre dizer que vai ser a ultima...
3,937692691621646336,958912558601293824,2018-01-31,23:59:04,A ansiedade nos consome até o dia em que decid...
4,176789934,958912363855532032,2018-01-31,23:58:17,"Dormir é um desafio pra quem tem ansiedade , p..."
...,...,...,...,...,...
444892,714256234208182272,1377470832973348864,2021-04-01,00:00:13,A ansiedade como manda embora o sono
444893,1066797087017365504,1377470832679788552,2021-04-01,00:00:13,a pessoa vem se achando a dona da razão com um...
444894,758330548158889984,1377470831798935552,2021-04-01,00:00:13,Minha ansiedade tá uma merda essa semana eu só...
444895,1062583509590704130,1377466414580240393,2021-04-01,00:00:08,@MillaPadilha78 @HugoGloss @eusoubarbz Enfia e...


In [6]:
frequency = df[['date']]
frequency['quantidade'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequency['quantidade'] = 1


In [7]:
total_de_tweets = frequency.shape[0]

In [8]:
frequency = frequency.groupby('date').sum()

In [9]:
frequency

Unnamed: 0_level_0,quantidade
date,Unnamed: 1_level_1
2018-01-01,1353
2018-01-02,2207
2018-01-03,2473
2018-01-04,2302
2018-01-05,2149
...,...
2021-04-26,16949
2021-04-27,18073
2021-04-28,15501
2021-04-29,15793


In [10]:
fig = px.line(frequency)
fig

In [11]:
frequency.index = pd.to_datetime(frequency.index)
frequency.index.freq = 'D'

In [12]:
frequency_per_month = frequency.groupby(pd.Grouper(freq='M')).sum()

In [13]:
fig = px.line(frequency_per_month)
fig

In [14]:
date = frequency.reset_index()
date = date['date']
day_name = date.dt.day_name()
dias_traduzidos = {'Monday': 'Segunda', 
                   'Tuesday': 'Terça',
                  'Wednesday': 'Quarta',
                  'Thursday': 'Quinta',
                  'Friday': 'Sexta',
                  'Saturday': 'Sábado',
                  'Sunday': 'Domingo'}
frequency.reset_index(inplace=True)
frequency['dia_da_semana'] = day_name.apply(lambda x: dias_traduzidos[x])
frequency.set_index('date')

Unnamed: 0_level_0,quantidade,dia_da_semana
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,1353,Segunda
2018-01-02,2207,Terça
2018-01-03,2473,Quarta
2018-01-04,2302,Quinta
2018-01-05,2149,Sexta
...,...,...
2021-04-26,16949,Segunda
2021-04-27,18073,Terça
2021-04-28,15501,Quarta
2021-04-29,15793,Quinta


In [15]:
frequency = frequency.groupby('dia_da_semana').sum()

In [16]:
fig = px.bar(frequency)

fig

In [17]:
def labelling(x):
    x = x.split()
    for i in range(len(x)):
        x[i] = unidecode(x[i].lower())

    label1 = ['terapia','psicoterapia', 'tratamento','psico', 'psicologo', 'psicologa', 'psiquiatra']
    label2 = ['crise', 'crises', 'surto', 'surtos', 'panico', 'morrer de ansiedade', 'ataque de panico', 'sem ar']
    label3 = ['depre', 'depressao','depressivo','depressiva']
    label4 = ['suicidio', 'suicida', 'se matar', 'me matar', 'mutilar']

    if any(y in label4 for y in x):
        return 4
    elif any(y in label3 for y in x):
        return 3
    elif any(y in label2 for y in x):
        return 2
    elif any(y in label1 for y in x):
        return 1
    else:
        return 0

In [18]:
labelled = df.sample(3000000)

In [19]:
labelled['label'] = labelled['tweet'].apply(labelling)

In [20]:
labelled['label'].value_counts()

0    2399231
2     507699
3      70561
1      19758
4       2751
Name: label, dtype: int64

In [21]:
labelled

Unnamed: 0,user_id,conversation_id,date,time,tweet,label
468048,1319633268170510336,1367207535829282823,2021-03-03,16:17:32,"n assistam Ginny e Georgia pfvr, eu to com ans...",0
62809,590626746,1186311949723852800,2019-10-21,12:03:04,Vai chegando perto do acamppp a ansiedade vai ...,0
62074,1204163151337598976,1298447390744158209,2020-08-25,22:29:16,mt bom ter ansiedade amo essa vontade de vomit...,0
49523,1224152772871884800,1376649922888499219,2021-03-29,19:35:32,@_ChutaChuta_ Vou morrer até lá de ansiedade,0
367414,153952163,1379415200298831877,2021-04-06,08:46:26,Não aguento mais sentir ansiedade.,0
...,...,...,...,...,...,...
78200,2744003091,1150213477950205952,2019-07-13,21:20:37,"minha ansiedade hoje ""foi embora"" então aprove...",0
80298,1176314914375385094,1364360608762716163,2021-02-23,19:44:52,Comprei meu primeiro livro de romance gay e eu...,0
60484,55434295,1051294324187717632,2018-10-13,22:11:15,Agora eu tô tratando e já não sinto mais esse ...,0
85405,839562448764080133,1104964392456765441,2019-03-11,00:36:55,"E aí ansiedade,vamo deixar eu dormir querida?",0


In [27]:
X = labelled[['tweet']]
y = labelled[['label']]

In [29]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)

In [39]:
X_under, y_under = undersample.fit_resample(X_under, y_under)

In [40]:
y_under.value_counts()

label
0        2751
1        2751
2        2751
3        2751
4        2751
dtype: int64

In [42]:
X_under['label'] = y_under

In [45]:
X_under.rename(columns={'tweet': 'texts', 'label': 'targets'}).to_csv(r'C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data\dataset\dataset.csv')

In [46]:
dataset = pd.read_csv(r'C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data\dataset\dataset.csv')

In [49]:
X_train, X_test, y_train, y_test = train_test_split(dataset['texts'], dataset['targets'], test_size=0.30, random_state=42, stratify=dataset['targets'])

In [50]:
dataset_test = pd.DataFrame(X_test)
dataset_test['targets'] = y_test
X_test, X_valid, y_test, y_valid = train_test_split(dataset_test['texts'], dataset_test['targets'], test_size=0.10, random_state=42, stratify=dataset_test['targets'])

In [51]:
dataset_train = pd.DataFrame(X_train)
dataset_train['targets'] = y_train

In [52]:
dataset_test = pd.DataFrame(X_test)
dataset_test['targets'] = y_test

In [53]:
dataset_valid = pd.DataFrame(X_valid)
dataset_valid['targets'] = y_valid

In [54]:
dataset_train.to_csv(r"C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data\dataset\dataset_train.csv")
dataset_test.to_csv(r"C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data\dataset\dataset_test.csv")
dataset_valid.to_csv(r"C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data\dataset\dataset_valid.csv")