# In [None]:
# Dataset exploration and model training
def eda(csv_path='twitter sentiment analysis.csv', viral_threshold=100):
    """Plot the viral/non-viral tweet distribution and a word cloud of viral tweets.

    A tweet is flagged as viral when ``retweets + likes`` is strictly greater
    than ``viral_threshold``.

    Args:
        csv_path: Path of the CSV file to load. The code reads its
            ``retweets``, ``likes`` and ``text`` columns.
        viral_threshold: Combined engagement a tweet must exceed to be
            counted as viral.

    Returns:
        None. A bar chart and (when viral tweets contain any words) a word
        cloud are displayed with ``plt.show()``.
    """
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from wordcloud import WordCloud

    data = pd.read_csv(csv_path)

    # Parenthesized so the intent (sum first, then compare) is explicit.
    data['viral'] = (data['retweets'] + data['likes']) > viral_threshold

    # Force a fixed [False, True] order and keep absent classes as 0 so the
    # "No"/"Sí" labels always line up with the right bar, even when the data
    # contains only one of the two classes.
    viral_counts = data['viral'].value_counts().reindex([False, True], fill_value=0)
    plt.figure(figsize=(8, 5))
    sns.barplot(x=['No', 'Sí'], y=viral_counts.values, palette='viridis')
    plt.title('Distribución de Tuits Virales')
    plt.xlabel('Es Viral')
    plt.ylabel('Cantidad')
    plt.show()

    # Drop missing texts first; astype(str) alone would turn NaN into the
    # literal word "nan" and pollute the cloud.
    viral_text = ' '.join(data.loc[data['viral'], 'text'].dropna().astype(str))
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(viral_text)
    except ValueError:
        # WordCloud raises ValueError when there is not a single word to draw
        # (e.g. no tweet exceeds the threshold); skip the figure instead of
        # crashing after the first plot was already shown.
        print('No hay palabras de tuits virales para generar la nube de palabras.')
        return

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Nube de Palabras de Tuits Virales')
    plt.show()

def train_model(csv_path='tweets.csv', viral_threshold=100, random_state=42):
    """Train a random-forest classifier that predicts whether a tweet is viral.

    The tweet text is lowercased, tokenized, stripped of English stop words
    and non-alphanumeric tokens, lemmatized, and vectorized with TF-IDF
    (at most 1000 features). A tweet is labelled viral when
    ``retweets + likes`` is strictly greater than ``viral_threshold``.

    Args:
        csv_path: Path of the CSV file to load. The code reads its
            ``retweets``, ``likes`` and ``text`` columns.
            NOTE(review): ``eda()`` in this file defaults to a different
            file name -- confirm which data set is intended.
        viral_threshold: Combined engagement a tweet must exceed to be
            labelled viral.
        random_state: Seed used for both the train/test split and the
            forest, so repeated runs on the same data are reproducible.

    Returns:
        None. A classification report for the 20% hold-out split is printed,
        and the fitted model and vectorizer are pickled to ``model.pkl`` and
        ``vectorizer.pkl`` in the current working directory.
    """
    import pandas as pd
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    import nltk

    # 'punkt_tab' is the tokenizer resource looked up by newer NLTK releases;
    # 'punkt' is kept for older ones. With default arguments nltk.download
    # reports an unavailable resource instead of raising.
    for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
        nltk.download(resource)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def preprocess(text):
        """Return ``text`` as space-joined lemmas without stop words or punctuation."""
        tokens = word_tokenize(text.lower())
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in tokens
            if word.isalnum() and word not in stop_words
        )

    data = pd.read_csv(csv_path)
    data['viral'] = (data['retweets'] + data['likes']) > viral_threshold
    # fillna('') keeps missing texts empty; astype(str) alone would turn NaN
    # into the token "nan".
    data['clean_text'] = data['text'].fillna('').astype(str).apply(preprocess)

    vectorizer = TfidfVectorizer(max_features=1000)
    # Keep the TF-IDF matrix sparse: the forest and train_test_split accept
    # sparse input, so densifying with .toarray() only costs memory.
    X = vectorizer.fit_transform(data['clean_text'])
    y = data['viral']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Persist both artifacts: the vectorizer is required to transform new
    # text into the feature space the model was trained on.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    with open('vectorizer.pkl', 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)

if __name__ == "__main__":
    # Script entry point: show the exploratory plots first, then train and
    # persist the model. Steps run in order; a failure stops the pipeline.
    for step in (eda, train_model):
        step()
