In [58]:
# Limpieza de datos
 
import pandas as pd
import pandera as pa
from datetime import datetime

def write_logs(severity, operation, description, dataset):
    with open('./data/logs.txt', 'a') as file:
        file.write(f'[{datetime.now()}] - [{severity}] - [{operation}] - [{dataset}] - {description}\n')
 
df = pd.read_csv('./data/modified_tweets_with_diverse_hashtags.csv')
 
df["username"] = df["username"].astype(str)
df["full_name"] = df["full_name"].astype(str, errors="ignore")  # Convertir a string, ignorando errores
df["content"] = df["content"].astype(str) 
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
df["retweets"] = pd.to_numeric(df["retweets"], errors="coerce").fillna(0).astype("Int64")
df["likes"] = pd.to_numeric(df["likes"], errors="coerce").fillna(0).astype("Int64")
df["replies"] = pd.to_numeric(df["replies"], errors="coerce").fillna(0).astype("Int64") 
df["verified"] = df["verified"].astype(bool)
 
# ¿Qué debería tener cada columna?
#Tenemos las columnas: Username Full_Name Content Created_at Retweet Likes Replies Verificado
# No podemos tener valores nulos en las columnas Username, Content, Created_at y Verificado
 
#Username: String no nulo
#Full_Name: String
#Content: String, no nulo
#Created_at: Datetime, no nulo, Limite superior la fecha actual. Limite infeior 2006.
#Retweets like replies: int
#Verificado: Boolean
 
df_schema = pa.DataFrameSchema({
    "username": pa.Column(pa.String, nullable=False),
    "full_name": pa.Column(pa.String, nullable=True),
    "content": pa.Column(pa.String, nullable=False),
    "created_at": pa.Column(pa.Timestamp, pa.Check(lambda x: (x < pd.Timestamp(datetime.now())) & (x >= pd.Timestamp('2006-01-01 00:00:00'))), nullable=False),
    "retweets": pa.Column(pa.Int, nullable=True),
    "likes": pa.Column(pa.Int, nullable=True),
    "replies": pa.Column(pa.Int, nullable=True),
    "verified": pa.Column(pa.Bool, nullable=False),
})
 
try:
    df_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    df_error_cases = err.failure_cases
 
    #Guardamos los errores en un arhivo
    df_error_cases.to_csv('./data/error_cases.csv', index=False)

    error_indexes = df_error_cases['index'].values
    write_logs('INFO', 'DROP', f'Eliminando filas con valores nulos y errores de formato que no sigan el esquema.', 'unique_ai_tweets_with_issues.csv')
    df = df.drop(error_indexes)
    write_logs('INFO', 'WRITE', 'Guardando el archivo en formato csv.', 'tweets_bronze.csv')
 
df.to_csv('./data/tweets_bronze.csv', index=False)

In [None]:
# Transformación de datos
import re
import random

df = pd.read_csv('./data/tweets_bronze.csv')
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")  # Convertir a datetime



regex_hashtag = re.compile(r"#\w+")

df["hashtags"] = df["content"].str.findall(regex_hashtag)

write_logs('INFO', 'WRITE', 'Identificando hashtags y almacenando como CSV.', 'tweets_gold.csv')

df.to_csv('./data/tweets_silver.csv', index=False)

df_hashtags_raw = df.explode("hashtags")

df_hashtags_ocurrences = df_hashtags_raw[['hashtags', 'created_at']]

df_hashtags_ocurrences["month_year"] = df_hashtags_ocurrences["created_at"].dt.to_period("M")

df_hashtags = df_hashtags_ocurrences.groupby(["month_year", "hashtags"]).size().reset_index(name="count")

write_logs('INFO', 'WRITE', 'Agrupamos hashtags por mes y año', 'hashtags_gold.csv')

df_hashtags["score"] = [random.uniform(0.0, 1.0) for _ in range(df_hashtags.shape[0])]

df_hashtags["sentiment_score"] = [random.uniform(0.0, 1.0) for _ in range(df_hashtags.shape[0])]

write_logs('INFO', 'WRITE', 'Calculamos score y sentiment_score y almacenamos en CSV', 'hashtags_gold.csv')

df_hashtags.to_csv('./data/hashtags_gold.csv', index=False)

df_hashtags



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hashtags_ocurrences["month-year"] = df_hashtags_ocurrences["created_at"].dt.to_period("M")


Unnamed: 0,month-year,hashtags,count,score,sentiment_score
0,2024-01,#AI,56,0.450818,0.524071
1,2024-01,#AIForGood,69,0.064792,0.292858
2,2024-01,#AIForGoodTrends,1,0.523859,0.242818
3,2024-01,#ArtificialIntelligence,64,0.169814,0.042876
4,2024-01,#ArtificialIntelligenceTrends,2,0.201044,0.125549
...,...,...,...,...,...
279,2024-12,#NeuralNetworks,27,0.801296,0.462817
280,2024-12,#NeuralNetworksTrends,1,0.818700,0.752927
281,2024-12,#Robotics,32,0.944381,0.313452
282,2024-12,#Tech,38,0.760890,0.522086
