# Dataset

In [1]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
import random

In [2]:
# Dataset in urls
urls = [
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_1.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_2.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_3.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_4.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_5.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_6.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_7.csv",
    "https://raw.githubusercontent.com/JoseChirif/NPL-project/main/data/parte_8.csv",
]

In [3]:
# Dataset headers
columnas = ["polarity", "id", "date", "query", "user", "text"]

# Read CSV files without headers and assign column names.
dataframes = [pd.read_csv(url, header=None, names=columnas) for url in urls]

# Combine all DataFrames into a single one
dataset = pd.concat(dataframes, ignore_index=True)

# Display the result
print(f'Combined dataset has {dataset.shape[0]} rows and {dataset.shape[1]} columns.')
dataset.head()

Combined dataset has 1600007 rows and 6 columns.


Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
dataset['polarity'].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
0,800007
4,800000


In [5]:
# Filter only the relevant columns (polarity and text)
dataset = dataset[['polarity', 'text']]

# Convert the polarity labels to make them more readable.
dataset['polarity'] = dataset['polarity'].map({0: 'negativo', 2: 'neutro', 4: 'positivo'})

print('dataset relevant columns')
dataset.head()

dataset relevant columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['polarity'] = dataset['polarity'].map({0: 'negativo', 2: 'neutro', 4: 'positivo'})


Unnamed: 0,polarity,text
0,negativo,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,negativo,is upset that he can't update his Facebook by ...
2,negativo,@Kenichan I dived many times for the ball. Man...
3,negativo,my whole body feels itchy and like its on fire
4,negativo,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Function to clean the text
def clear_text(text):
    text = re.sub(r'http\S+||www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs.
    text = re.sub(r'\@@w+|\#', '', text) # Remove mentions and hashtags
    text = text.strip() # Remove trailing spaces
    return text

# Apply cleanup function
dataset['text'] = dataset['text'].apply(clear_text)
dataset.head()


Unnamed: 0,polarity,text
0,negativo,@switchfoot - A that's a bummer. You shoulda...
1,negativo,is upset that he can't update his Facebook by ...
2,negativo,@Kenichan I dived many times for the ball. Man...
3,negativo,my whole body feels itchy and like its on fire
4,negativo,"@nationwideclass no, it's not behaving at all...."


In [7]:
dataset.head(100)

Unnamed: 0,polarity,text
0,negativo,@switchfoot - A that's a bummer. You shoulda...
1,negativo,is upset that he can't update his Facebook by ...
2,negativo,@Kenichan I dived many times for the ball. Man...
3,negativo,my whole body feels itchy and like its on fire
4,negativo,"@nationwideclass no, it's not behaving at all...."
...,...,...
95,negativo,Strider is a sick little puppy
96,negativo,"so rylee,grace...wana go steve's party or not?..."
97,negativo,"hey, I actually won one of my bracket pools! T..."
98,negativo,"@stark YOU don't follow me, either and i work..."


In [8]:
dataset['polarity'].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
negativo,800007
positivo,800000


# NPL project

In [9]:
# Reduce the DataFrame to the columns 'polarity' and 'text'.
dataset = dataset[['polarity', 'text']]

# Convert 'polarity' to 0 and 1 (if not already in that format)
dataset['polarity'] = dataset['polarity'].apply(lambda x: 1 if x == 4 else 0)

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'], dataset['polarity'], test_size=0.2, random_state=42
)

#  Convert text and labels to tensors
X_train = tf.convert_to_tensor(X_train, dtype=tf.string)
X_test = tf.convert_to_tensor(X_test, dtype=tf.string)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)

# Create a TextVectorization to process the text
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=100)
vectorizer.adapt(X_train)

# Build the classification model
model = Sequential([
    vectorizer, # Layer to convert text to integer sequences
    Embedding(input_dim=20000, output_dim=128), # Embedding layer
    GlobalAveragePooling1D(), # Global reduction
    Dropout(0.5), # Regularization
    Dense(128, activation='relu'),
    Dropout(0.5), # Regularization
    Dense(1, activation='sigmoid'), # Binary classification
])

#  Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Accuracy: {accuracy:.2f}")
print(f"Loss: {loss:.2f}")



Epoch 1/5
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1815s[0m 45ms/step - accuracy: 0.9998 - loss: 9.9147e-04 - val_accuracy: 1.0000 - val_loss: 6.7406e-17
Epoch 2/5
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1779s[0m 44ms/step - accuracy: 1.0000 - loss: 4.5814e-11 - val_accuracy: 1.0000 - val_loss: 4.5615e-18
Epoch 3/5
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1776s[0m 44ms/step - accuracy: 1.0000 - loss: 1.1512e-11 - val_accuracy: 1.0000 - val_loss: 2.0351e-18
Epoch 4/5
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1705s[0m 41ms/step - accuracy: 1.0000 - loss: 9.1610e-12 - val_accuracy: 1.0000 - val_loss: 9.9461e-19
Epoch 5/5
[1m40001/40001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1706s[0m 43ms/step - accuracy: 1.0000 - loss: 2.6949e-12 - val_accuracy: 1.0000 - val_loss: 7.5426e-19
[1m10001/10001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 5ms/step - accuracy: 1.0000 - loss: 7.5336e-

# Conversation between bots

In [10]:
# Function to classify the sentiment of a message
def classify_sentiment(model, text):
    text_tensor = tf.convert_to_tensor([text])  # Convert the text to tensor
    prediction = model.predict(text_tensor)[0][0]  # Get the prediction
    return "positive" if prediction > 0.7 else "negative" if prediction < 0.3 else "neutral"

# Function to select a response from the dataset based on the detected sentiment
def generate_response(dataset, sentiment):
    if sentiment == "positive":
        responses = dataset[dataset['polarity'] == 1]['text'].tolist()
    elif sentiment == "negative":
        responses = dataset[dataset['polarity'] == 0]['text'].tolist()
    else:
        responses = dataset['text'].tolist()
    return random.choice(responses)

# Simulate a conversation between two bots
def simulate_conversation(model, dataset, initial_message, response_count):
    current_message = initial_message
    conversation = []  # Store the complete conversation
    for i in range(response_count):
        sentiment = classify_sentiment(model, current_message)
        response = generate_response(dataset, sentiment)
        conversation.append(f"Bot {i % 2 + 1}: {current_message}")
        current_message = response
    return conversation

# User inputs
initial_message = input("Enter the initial message for the Bot: ")
response_count = int(input("Enter the number of responses to generate: "))

# Simulate the conversation
conversation = simulate_conversation(model, dataset, initial_message, response_count)

# Display the conversation
print("\nConversation between the bots:")
for message in conversation:
    print(message)

# Analyze the conversation's sentiment
def analyze_conversation_sentiment(model, conversation):
    sentiments = []
    for message in conversation:
        text = message.split(":")[1].strip()  # Extract the text
        sentiment = classify_sentiment(model, text)
        sentiments.append(sentiment)
    # Count positives, negatives, and neutrals
    analysis = {
        "positive": sentiments.count("positive"),
        "negative": sentiments.count("negative"),
        "neutral": sentiments.count("neutral"),
    }
    return analysis

# Example of using the function
sentiment_analysis = analyze_conversation_sentiment(model, conversation)

# Display sentiment analysis results
print("\nConversation Sentiment Analysis:")
print(f"Positive: {sentiment_analysis['positive']}")
print(f"Negative: {sentiment_analysis['negative']}")
print(f"Neutral: {sentiment_analysis['neutral']}")

Enter the initial message for the Bot: Hello World!
Enter the number of responses to generate: 30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━