In [1]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import spacy
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA


import statsmodels.api as sm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator


import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# NLTK resources needed by the preprocessing step:
#   stopwords         -> stopwords.words('english')
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases, 3.9+, load
#                        'punkt_tab'; older ones load 'punkt')
#   wordnet           -> WordNetLemmatizer
# The former nltk.download('corpus') call was removed: 'corpus' is not a
# resource id, so it only printed a lookup error. The duplicated 'stopwords'
# download was removed as well.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Plot styling used by every figure in the notebook
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')


2024-11-16 23:10:02.474181: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-16 23:10:02.534166: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-16 23:10:02.558627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731816602.608902 1033524 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731816602.621320 1033524 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 23:10:02.707263: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
# Load the CSV files produced by the EDA step
subreddit_data = pd.read_csv('../data/subreddit_data.csv')
posts_data = pd.read_csv('../data/posts_data.csv')
subreddit_data = subreddit_data.fillna("")
posts_data = posts_data.fillna("")

# Left join: every post is kept, subreddit columns are attached by subreddit_id
combined_data = posts_data.merge(subreddit_data, on='subreddit_id', how='left')

# The left join re-introduces NaN in the subreddit columns for posts whose
# subreddit_id has no match. Downstream text cleaning calls str() on each
# value, which would turn those NaN into the literal word "nan", so the text
# columns are filled again after the merge.
_text_columns = combined_data.select_dtypes(include=['object', 'string']).columns
combined_data = combined_data.fillna({column: "" for column in _text_columns})

# Tokenización y Lematización

In [3]:
class RedditTextProcessor:
    """Tokenizes, filters and lemmatizes the free-text columns of the Reddit data."""

    def __init__(self):
        # WordNet lemmatizer and English stop-word list, both provided by NLTK
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Columns rewritten by process_dataframe; 'tittleSubreddit' is spelled
        # the way the column appears in the data
        self.columns_to_process = ["titlePost", "tittleSubreddit", "descriptionReddit"]

    def preprocess_text(self, text):
        """Return ``text`` as a space-joined string of cleaned tokens.

        The value is converted with ``str()``, lower-cased and tokenized with
        ``word_tokenize``. Only purely alphabetic tokens that are not in the
        stop-word set are kept, and each kept token is lemmatized.

        A string (not a token list) is returned because the downstream
        vectorizers consume raw strings.
        """
        tokens = word_tokenize(str(text).lower())
        kept = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in self.stop_words
        ]
        return " ".join(kept)

    def process_dataframe(self, df):
        """Return a copy of ``df`` with the configured text columns cleaned.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame. It is not modified, so the raw text stays available
            and re-running the call on the same input gives the same result.

        Returns
        -------
        pandas.DataFrame
            New frame in which every column listed in ``columns_to_process``
            that exists in ``df`` and holds text (object or pandas string
            dtype) has been passed through ``preprocess_text``. All other
            columns are left unchanged.
        """
        result = df.copy()
        for column in self.columns_to_process:
            if column not in result.columns:
                continue
            dtype = result[column].dtype
            # Accept both classic object columns and the dedicated pandas
            # string dtype; a plain `dtype == object` test skips the latter.
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                result[column] = result[column].apply(self.preprocess_text)
        return result

In [4]:
# Clean the text columns of the merged posts/subreddit frame; processed_data
# is the frame every later step works on.
textProcessor = RedditTextProcessor()
processed_data = textProcessor.process_dataframe(combined_data)

# Clustering

In [5]:
class RedditClustering:
    """K-means clustering of Reddit posts on TF-IDF text features plus scaled numeric features.

    The feature matrix is built once in ``__init__`` from the frame passed
    there. ``apply_kmeans`` assigns labels by row position, so the frames
    handed to the other methods must contain the same rows in the same order.
    """

    def __init__(self, processed_data):
        """Fit one TF-IDF vectorizer per text column and assemble the feature matrix.

        Parameters
        ----------
        processed_data : pandas.DataFrame
            Must contain the text columns 'titlePost', 'tittleSubreddit' and
            'descriptionReddit' and the numeric columns 'upVotes', 'scorePost'
            and 'commentsPost'.
        """
        # Text -> numeric features. Terms present in more than half of the
        # documents (max_df=0.5) or in fewer than 5 documents (min_df=5) are dropped.
        self.vectorizer_title = TfidfVectorizer(max_df=0.5, min_df=5, stop_words='english')
        self.vectorizer_subreddit = TfidfVectorizer(max_df=0.5, min_df=5, stop_words='english')
        self.vectorizer_description = TfidfVectorizer(max_df=0.5, min_df=5, stop_words='english')

        tfidf_title = self.vectorizer_title.fit_transform(processed_data['titlePost'])
        tfidf_subreddit = self.vectorizer_subreddit.fit_transform(processed_data['tittleSubreddit'])
        tfidf_description = self.vectorizer_description.fit_transform(processed_data['descriptionReddit'])

        combined_tfidf = hstack([tfidf_title, tfidf_subreddit, tfidf_description])

        # Standardize the numeric engagement columns before appending them to
        # the TF-IDF block
        post_numeric_features = StandardScaler().fit_transform(
            processed_data[['upVotes', 'scorePost', 'commentsPost']]
        )
        self.combined_features = hstack([combined_tfidf, post_numeric_features])

    def get_k_values(self, k_values=range(2, 10)):
        """Fit K-means for several k and plot inertia and silhouette curves.

        Parameters
        ----------
        k_values : iterable of int, optional
            Candidate cluster counts. Defaults to 2..9, the range that was
            previously hard-coded.

        Prints the silhouette coefficient for each k and shows two figures
        (elbow method and silhouette). Returns None.
        """
        k_values = list(k_values)  # materialize once: iterated here and reused for plotting
        distortions = []
        silhouette_scores = []
        for k in k_values:
            kmeans_model = KMeans(n_clusters=k, random_state=42)
            kmeans_model.fit(self.combined_features)
            distortions.append(kmeans_model.inertia_)
            silhouette_avg = silhouette_score(self.combined_features, kmeans_model.labels_)
            silhouette_scores.append(silhouette_avg)
            print(f"Para k={k}, el coeficiente de silueta es {silhouette_avg}")

        # Elbow method plot
        plt.figure(figsize=(10, 5))
        plt.plot(k_values, distortions, 'bx-')
        plt.xlabel('Número de clusters')
        plt.ylabel('Distorsión')
        plt.title('Método del Codo para K óptimo')
        plt.show()

        # Silhouette coefficient plot
        plt.figure(figsize=(10, 5))
        plt.plot(k_values, silhouette_scores, 'bx-')
        plt.xlabel('Número de clusters')
        plt.ylabel('Coeficiente de Silueta')
        plt.title('Coeficiente de Silueta para K óptimo')
        plt.show()

    def apply_kmeans(self, k, df):
        """Cluster the stored feature matrix into ``k`` groups and label ``df``.

        Adds (or overwrites) the 'Cluster' column on ``df`` itself, prints the
        silhouette score and returns the same frame.

        Raises
        ------
        ValueError
            If ``df`` does not have one row per row of the feature matrix.
            This is checked before fitting so a mismatch fails fast rather
            than after the expensive fit.
        """
        n_rows = self.combined_features.shape[0]
        if len(df) != n_rows:
            raise ValueError(
                f"df has {len(df)} rows but the feature matrix has {n_rows}; "
                "pass the frame the clustering was built from"
            )

        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(self.combined_features)
        clusters = kmeans.labels_
        df['Cluster'] = clusters  # positional assignment: row i gets label i
        score = silhouette_score(self.combined_features, clusters)
        print("Silhouette Score para clusters:", score)
        return df

    def analyze_clusters(self, df, n_keywords=5):
        """Name each cluster after its highest-weighted title terms.

        For every value of ``df['Cluster']`` the titles of that cluster are
        transformed with the already fitted title vectorizer, the TF-IDF
        weights are summed per term, and the ``n_keywords`` heaviest terms are
        joined with " / ". The result is printed and written to the
        categorical column 'Cluster_Name' on ``df``, which is returned.
        """
        # The vocabulary does not change between clusters: fetch it once
        # instead of once per term per cluster.
        feature_names = self.vectorizer_title.get_feature_names_out()
        cluster_names = {}

        for cluster_id in sorted(df['Cluster'].unique()):
            titles_in_cluster = df.loc[df['Cluster'] == cluster_id, 'titlePost']

            tfidf_matrix = self.vectorizer_title.transform(titles_in_cluster)
            # Flatten the per-term sums to a 1-D array so the code does not
            # depend on whether the sparse sum comes back as a matrix or an array
            term_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

            # sorted() is stable, so equally weighted terms keep vocabulary order
            ranked = sorted(zip(feature_names, term_weights), key=lambda pair: pair[1], reverse=True)
            top_keywords = [word for word, _ in ranked[:n_keywords]]

            cluster_name = " / ".join(top_keywords)
            cluster_names[cluster_id] = cluster_name

            print(f"Cluster {cluster_id}: {cluster_name}")

        # Attach the names to the frame
        df['Cluster_Name'] = df['Cluster'].map(cluster_names).astype('category')
        return df

    def assign_subreddit_names(self, df):
        """Name each cluster after its most frequent 'tittleSubreddit' value.

        Prints one line per cluster, writes the categorical column
        'Cluster_Name' on ``df`` (replacing any previous naming) and returns
        ``df``. When several subreddits tie, the first value returned by
        ``Series.mode()`` is used.
        """
        cluster_names = {}

        for cluster_id in sorted(df['Cluster'].unique()):
            subreddits_in_cluster = df.loc[df['Cluster'] == cluster_id, 'tittleSubreddit']
            most_common_subreddit = subreddits_in_cluster.mode()[0]
            cluster_names[cluster_id] = most_common_subreddit

            print(f"Cluster {cluster_id}: {most_common_subreddit}")

        df['Cluster_Name'] = df['Cluster'].map(cluster_names).astype('category')
        return df


# Fit the vectorizers and build the combined feature matrix from the cleaned posts
clustering = RedditClustering(processed_data)
# Uncomment to print/plot the elbow and silhouette diagnostics used to pick k
#clustering.get_k_values()


In [6]:
# Cluster the posts into 120 groups (k is hard-coded here), name every cluster
# after its top title keywords, show the distinct (id, name) pairs and persist
# the labelled frame for the following sections.
processed_data = clustering.apply_kmeans(120, processed_data)
processed_data = clustering.analyze_clusters(processed_data)
print(processed_data[['Cluster', 'Cluster_Name']].drop_duplicates())
processed_data.to_csv('../data/processed_data.csv', index=False)

Silhouette Score para clusters: 0.29287031958289295
Cluster 0: bro / man / dude / nice / boy
Cluster 1: warframe / prime / new / guy / like
Cluster 2: jagex / osrs / time / runescape / rot
Cluster 3: year / know / olympic / old / japan
Cluster 4: year / water / world / look / old
Cluster 5: trump / say / people / game / biden
Cluster 6: max / lewis / race / hamilton / verstappen
Cluster 7: people / thing / actually / think / worst
Cluster 8: guide / cool / world / type / different
Cluster 9: time / got / think / good / knew
Cluster 10: cosplay / shadowheart / larian / astarion / karlach
Cluster 11: trump / harris / kamala / president / donald
Cluster 12: trump / donald / harris / year / biden
Cluster 13: dad / daughter / love / man / year
Cluster 14: pal / palworld / base / game / like
Cluster 15: russian / drone / soldier / ukrainian / fpv
Cluster 16: car / money / credit / bank / year
Cluster 17: people / aitah / movie / husband / year
Cluster 18: time / got / day / good / man
Cluste

In [7]:
# Reload the labelled posts from the CSV written by the clustering step.
# NOTE(review): read_csv is called without dtype/parse_dates options, so
# createdPost comes back as text and Cluster_Name as plain strings; empty text
# fields are presumably read back as NaN — confirm if later cells rely on them.
processed_data = pd.read_csv('../data/processed_data.csv')
processed_data.head()

Unnamed: 0,subreddit_id,titlePost,createdPost,scorePost,upVotedRatio,upVotes,commentsPost,tittleSubreddit,subscribersReddit,descriptionReddit,createdReddit,Cluster,Cluster_Name
0,0,mortgage rate,2024-04-24 04:02:06,22171,0.97,22171,1655,home,256481,,2009-01-25 02:25:57,56,year / game / time / got / man
1,0,parent bought house month ago company flipped ...,2024-05-17 21:16:37,19064,0.89,19064,5813,home,256481,,2009-01-25 02:25:57,94,aitah / aita / wife / telling / husband
2,0,pella window would ok reacting,2024-08-27 20:18:19,3531,0.98,3531,1739,home,256481,,2009-01-25 02:25:57,32,roommate / house / room / home / bad
3,0,basement floor leak,2024-07-16 17:54:12,1878,0.98,1878,569,home,256481,,2009-01-25 02:25:57,32,roommate / house / room / home / bad
4,0,builder told normal home something fix,2024-05-12 03:42:43,1790,0.95,1790,634,home,256481,,2009-01-25 02:25:57,32,roommate / house / room / home / bad


# Análisis de tendencias (Holt-Winters)

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings
# Suppresses every warning raised from this point on (no category or module
# filter is given), which keeps the model-fitting output quiet.
warnings.filterwarnings('ignore')

def analyze_reddit_trends(df):
    """
    Build daily per-cluster engagement metrics and derive current and future trends.

    Parameters:
    df: DataFrame with the columns createdPost, Cluster, scorePost, upVotes,
        commentsPost. Its createdPost column is converted to datetime in place.

    Returns:
    dict with two entries: 'current_trends' (output of identify_current_trends)
    and 'future_predictions' (output of predict_future_trends)
    """
    # Make sure the timestamp column is datetime (this rewrites the caller's column)
    df['createdPost'] = pd.to_datetime(df['createdPost'])

    # One row per (calendar day, cluster)
    group_keys = [pd.Grouper(key='createdPost', freq='D'), 'Cluster']
    aggregations = {
        'scorePost': 'mean',
        'upVotes': 'sum',
        'commentsPost': 'sum',
        'Cluster': 'count',  # number of posts in the group
    }
    daily_metrics = df.groupby(group_keys).agg(aggregations)
    daily_metrics = daily_metrics.rename(columns={'Cluster': 'post_count'})

    def _standardized(column):
        # z-score of one metric, computed over all (day, cluster) rows
        return StandardScaler().fit_transform(daily_metrics[[column]])

    # Engagement score: weighted sum of the standardized metrics
    score_part = _standardized('scorePost') * 0.3
    votes_part = _standardized('upVotes') * 0.4
    comments_part = _standardized('commentsPost') * 0.3
    daily_metrics['engagement_score'] = score_part + votes_part + comments_part

    # Current trends are evaluated first, then the forecasts
    return {
        'current_trends': identify_current_trends(daily_metrics),
        'future_predictions': predict_future_trends(daily_metrics),
    }

def identify_current_trends(daily_metrics):
    """
    Rank clusters by their mean engagement over the most recent window.

    The window ends at the latest date found in the first index level and
    starts 30 days earlier; both ends are included by the label slice.
    Returns a dict with 'top_clusters' (up to five clusters mapped to their
    mean engagement_score) and 'trend_period' (the window as text).
    """
    window_end = daily_metrics.index.get_level_values(0).max()
    window_start = window_end - timedelta(days=30)

    # Label-based slice on the date level of the index
    window = daily_metrics.loc[window_start:window_end]

    # Mean engagement per cluster (second index level), five largest kept
    top_clusters = (
        window
        .groupby(level=1)['engagement_score']
        .mean()
        .nlargest(5)
    )

    period_label = f"{window_start.date()} to {window_end.date()}"
    return {
        'top_clusters': top_clusters.to_dict(),
        'trend_period': period_label
    }

def predict_future_trends(daily_metrics, forecast_days=14, seasonal_periods=7, min_observations=14):
    """
    Forecast each cluster's engagement with an additive Holt-Winters model.

    Parameters:
    daily_metrics: DataFrame indexed by (date, cluster) with an
        'engagement_score' column.
    forecast_days: number of steps to forecast (default 14).
    seasonal_periods: season length passed to the model and number of trailing
        observations used as the "current" average (default 7, a weekly pattern).
    min_observations: clusters with fewer observations are skipped (default 14,
        i.e. two weeks of data).

    Returns:
    dict mapping cluster -> {'trend_direction': 'up' | 'down',
                             'trend_strength': |forecast mean - current mean|,
                             'forecast_values': forecast as a dict}.
    Clusters that are skipped or whose model cannot be built/fitted are absent.
    """
    predictions = {}

    for cluster in daily_metrics.index.get_level_values(1).unique():
        cluster_data = daily_metrics.xs(cluster, level=1)['engagement_score']
        # NOTE(review): the series is used as returned by xs(); it is not
        # reindexed to a regular daily frequency here, so the season length
        # counts observations — confirm that days without posts are acceptable.

        if len(cluster_data) < min_observations:
            continue

        try:
            # Construction is inside the try as well: the constructor can
            # reject a series, and one bad cluster must not abort the loop.
            model = ExponentialSmoothing(
                cluster_data,
                seasonal_periods=seasonal_periods,
                trend='add',
                seasonal='add'
            )
            forecast = model.fit().forecast(forecast_days)
        except Exception as error:
            # Best effort per cluster. Unlike a bare `except:`, this lets
            # KeyboardInterrupt/SystemExit propagate.
            warnings.warn(f"Skipping forecast for cluster {cluster}: {error}", RuntimeWarning)
            continue

        # Compare the forecast mean with the mean of the last season
        current_avg = cluster_data.iloc[-seasonal_periods:].mean()
        predicted_avg = forecast.mean()
        trend_direction = "up" if predicted_avg > current_avg else "down"

        predictions[cluster] = {
            'trend_direction': trend_direction,
            'trend_strength': abs(predicted_avg - current_avg),
            'forecast_values': forecast.to_dict()
        }

    return predictions

def get_trend_insights(results, cluster_names):
    """
    Turn the raw trend results into readable labels.

    results is the dict returned by analyze_reddit_trends; cluster_names maps
    cluster ids to display names (ids without a name are shown as
    "Cluster <id>"). Returns a dict with the lists 'current_trends' and
    'future_predictions'.
    """
    def display_name(cluster):
        return cluster_names.get(cluster, f"Cluster {cluster}")

    # Current trends: bucket the mean engagement score into three levels
    current = []
    for cluster, score in results['current_trends']['top_clusters'].items():
        if score > 0.5:
            level = 'Alto'
        elif score > 0:
            level = 'Medio'
        else:
            level = 'Bajo'
        current.append({'cluster': display_name(cluster), 'engagement_level': level})

    # Predictions: only changes larger than 0.5 are reported
    future = [
        {
            'cluster': display_name(cluster),
            'prediction': 'Aumentará' if pred['trend_direction'] == 'up' else 'Disminuirá',
            'confidence': 'Alta' if pred['trend_strength'] > 1 else 'Media'
        }
        for cluster, pred in results['future_predictions'].items()
        if pred['trend_strength'] > 0.5
    ]

    return {
        'current_trends': current,
        'future_predictions': future
    }




In [9]:
# Map each cluster id to its display name, run the trend analysis on the
# labelled posts and convert the raw results into readable labels.
cluster_names = processed_data.set_index('Cluster')['Cluster_Name'].to_dict()
results = analyze_reddit_trends(processed_data)
insights = get_trend_insights(results, cluster_names)


# Report the clusters with the highest recent engagement
print("Tendencias Actuales:")
for trend in insights['current_trends']:
    print(f"- {trend['cluster']}: Nivel de engagement {trend['engagement_level']}")

# Report the clusters whose forecast differs noticeably from the recent average
print("\nPredicciones:")
for pred in insights['future_predictions']:
    print(f"- {pred['cluster']}: {pred['prediction']} (Confianza: {pred['confidence']})")

Tendencias Actuales:
- president / man / day / year / today: Nivel de engagement Alto
- year / man / guy / like / work: Nivel de engagement Alto
- year / picture / cat / old / time: Nivel de engagement Alto
- year / know / olympic / old / japan: Nivel de engagement Alto
- til / year / know / peter / guy: Nivel de engagement Alto

Predicciones:
- til / year / know / peter / guy: Aumentará (Confianza: Media)
- year / know / olympic / old / japan: Aumentará (Confianza: Media)
