In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import plotly.express as px

In [2]:
# Descargar recursos de NLTK si no están disponibles
nltk.download('stopwords')
nltk.download('punkt_tab')

# Cargar los datos desde el archivo CSV
train_file = pd.read_csv("train.csv")

print(train_file)

# Información básica del dataset
print(train_file.info())

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  
7610  M1.94 [01:04 UT

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mague\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mague\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Función para limpiar el texto de los tweets
def clean_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Quitar URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Quitar caracteres especiales y puntuación
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenización
    words = word_tokenize(text)
    # Quitar stopwords
    stop_words = set(stopwords.words('english'))
    cleaned_text = [word for word in words if word not in stop_words]
    return ' '.join(cleaned_text)

# Aplicar la función de limpieza a la columna de texto
train_file['clean_text'] = train_file['text'].apply(clean_text)

# Mostrar las primeras filas con el texto limpio
print(train_file[['text', 'clean_text']].head())

train_file.to_csv("train-cleaned.csv")

                                                text  \
0  Our Deeds are the Reason of this #earthquake M...   
1             Forest fire near La Ronge Sask. Canada   
2  All residents asked to 'shelter in place' are ...   
3  13,000 people receive #wildfires evacuation or...   
4  Just got sent this photo from Ruby #Alaska as ...   

                                          clean_text  
0       deeds reason earthquake may allah forgive us  
1              forest fire near la ronge sask canada  
2  residents asked shelter place notified officer...  
3  people receive wildfires evacuation orders cal...  
4  got sent photo ruby alaska smoke wildfires pou...  


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.probability import FreqDist
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Cargar los datos desde el archivo CSV
train_file = pd.read_csv("train-cleaned.csv")

# Separar los tweets por categoría
disaster_tweets = train_file[train_file['target'] == 1]
non_disaster_tweets = train_file[train_file['target'] == 0]

# Unir todos los textos en cada categoría
all_words_disaster = ' '.join(disaster_tweets['clean_text'])
all_words_non_disaster = ' '.join(non_disaster_tweets['clean_text'])

# Crear frecuencias de palabras
fdist_disaster = FreqDist(all_words_disaster.split())
fdist_non_disaster = FreqDist(all_words_non_disaster.split())

In [5]:
from dash import Dash, dcc, html
import plotly.express as px
import pandas as pd

app = Dash(__name__)

# Datos para el gráfico de palabras más comunes en tweets de desastre
common_words_disaster = fdist_disaster.most_common(30)
df_disaster = pd.DataFrame(common_words_disaster, columns=['Palabra', 'Frecuencia'])

# Crear el gráfico de barras para los tweets de desastre
fig_disaster = px.bar(df_disaster, x='Palabra', y='Frecuencia', title='Frecuencia de Palabras - Tweets de Desastres')

# Datos para el gráfico de palabras más comunes en tweets de no desastre
common_words_non_disaster = fdist_non_disaster.most_common(30)
df_non_disaster = pd.DataFrame(common_words_non_disaster, columns=['Palabra', 'Frecuencia'])

# Crear el gráfico de barras para los tweets de no desastre
fig_non_disaster = px.bar(df_non_disaster, x='Palabra', y='Frecuencia', title='Frecuencia de Palabras - Tweets de No Desastres')

app.layout = html.Div(children=[
    html.H1(children='Análisis de Tweets de Desastres y No Desastres'),

    html.Div(children='''
        Frecuencia de palabras más comunes.
    '''),

    # Gráfico de palabras más comunes en tweets de desastres
    dcc.Graph(
        id='disaster-words',
        figure=fig_disaster
    ),

    # Gráfico de palabras más comunes en tweets de no desastres
    dcc.Graph(
        id='non-disaster-words',
        figure=fig_non_disaster
    )
])

# Ejecutar la aplicación
if __name__ == '__main__':
    app.run_server(debug=True)