In [29]:
# Load libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Visualization

import tensorflow as tf # Deep Learning
import tensorflow_hub as hub # Pre-trained models
 

# Data Preprocessing

In [30]:
df = pd.read_csv('Twitter_Data.csv') # Read the data only with the columns that we need
df.head() # Show the first 5 rows of the data
df2 = pd.read_csv('Reddit_Data.csv')
df2 = df2.rename(columns={'clean_comment': 'clean_text'})
df = pd.concat([df, df2])

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200229 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  200125 non-null  object 
 1   category    200222 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.6+ MB


As we can see, there are missing values in the data. We will drop the missing values. Because it is a small amount of data.

In [32]:
df.dropna(inplace=True) # Drop the missing values

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200118 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  200118 non-null  object 
 1   category    200118 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.6+ MB


In [34]:
df['category'].unique()

array([-1.,  0.,  1.])

Now need to convert the labels to numerical values. We will convert the labels to 0, 1, 2. 0 for Negative, 1 for Neutral, 2 for Positive.

In [35]:
df['category'] = df['category'].apply(lambda x: 0 if x == -1 else (1 if x == 0 else 2)) # Convert the labels to numerical values

In [36]:
df['category'].unique()
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,0
1,talk all the nonsense and continue all the dra...,1
2,what did just say vote for modi welcome bjp t...,2
3,asking his supporters prefix chowkidar their n...,2
4,answer who among these the most powerful world...,2


Check if data is balanced

In [37]:
len(df[df['category'] == 0]) , len(df[df['category'] == 1]) , len(df[df['category'] == 2]) # Count the number of each label

(43786, 68253, 88079)

Data is balanced. Now we can convert the data to a tensorflow dataset.

In [38]:
# function that converts a panda dataframe to a tensorflow dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=1024): # Function to convert a dataframe to a dataset
    df = dataframe.copy() # avoid modifying the original dataframe
    labels = df.pop("category") # Remove the label column from the dataframe
    df = df["clean_text"] # Keep only the description column
    ds = tf.data.Dataset.from_tensor_slices((df, labels)) # Create a tensorflow dataset
    if shuffle: # Shuffle the dataset if needed
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)  # Batch the dataset
    ds = ds.prefetch(tf.data.AUTOTUNE) # automatically tune the number of batches to prefetch based on the current runtime conditions
    return ds

In [39]:
train, val , test = np.split(df.sample(frac=1), [int(.8*len(df)), int(.9*len(df))]) # Split the data into train, validation and test sets with 80%, 10% and 10% of the data 

  return bound(*args, **kwds)


In [40]:
train_data = df_to_dataset(train) # Convert the train dataframe to a tensorflow dataset
val_data = df_to_dataset(val) # Convert the validation dataframe to a tensorflow dataset
test_data = df_to_dataset(test) # Convert the test dataframe to a tensorflow dataset

# Model

In [41]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2" # Load the embedding model
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True) # Create a keras layer with the embedding model

ValueError: Trying to load a model of incompatible/unknown type. '/var/folders/71/vnznjlz52qj_1p44k0hb7sw00000gn/T/tfhub_modules/74a841d6eb84e8d93d913d716fb5440d020cc291' contains neither 'saved_model.pb' nor 'saved_model.pbtxt'.

In [None]:
model = tf.keras.Sequential() # Create a sequential model
model.add(hub_layer) # Add the pre-trained layer

# Assuming the output of hub_layer is a flat vector. This is just a conceptual example.
# In practice, CNNs need sequence or matrix inputs.
#model.add(tf.keras.layers.Reshape((50, 1)))  # Reshape for CNN, if necessary
#model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'))  # Example CNN layer
#model.add(tf.keras.layers.GlobalMaxPooling1D())  # Pooling layer to reduce dimensionality


model.add(tf.keras.layers.Dense(16, activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.01))) # Add a hidden layer with 16 neurons
model.add(tf.keras.layers.Dense(3, activation='softmax')) 

In [None]:
model.compile(
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001), # gradient descent algorithm
    loss =  tf.keras.losses.SparseCategoricalCrossentropy(), # loss function
    metrics = ['accuracy']
)

In [None]:
model.evaluate(train_data) # Evaluate the model on the test data

In [None]:
model.evaluate(val_data) # Evaluate the model on the validation data

In [None]:
history = model.fit(
    train_data,
    validation_data = val_data,
    epochs = 10
)

In [None]:
plt.plot(history.history['accuracy'], label='accuracy') # Plot the accuracy of the model on the train data
plt.plot(history.history['val_accuracy'], label = 'val_accuracy') # Plot the accuracy of the model on the validation data
plt.title('Accuracy of the model')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
plt.plot(history.history['loss'], label='loss') # Plot the accuracy of the model on the train data
plt.plot(history.history['val_loss'], label = 'val_loss') # Plot the accuracy of the model on the validation data
plt.title('Loss of the model')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
model.evaluate(test_data) # Evaluate the model on the test data

In [None]:
def predict_sentiment(model, text):
    
    text = text.lower() # Convert the text to lowercase
    # Convert the text to a pandas DataFrame
    df = pd.DataFrame([text], columns=['Tweet'])

    # Convert the DataFrame to a tensorflow dataset
    ds = tf.data.Dataset.from_tensor_slices(df["Tweet"]).batch(1)

    # Use the model to predict the sentiment
    prediction = model.predict(ds)

    # Get the index of the maximum value (this will be the predicted label)
    predicted_label = np.argmax(prediction)

    # Map the numerical label back to the original label
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    predicted_sentiment = sentiment_map[predicted_label]

    # Print the predicted sentiment
    print(f"{text} is {predicted_sentiment}")

In [None]:
predict_sentiment(model, "I am Gilad Fisher") 
predict_sentiment(model, "I am Barak Finkel")
predict_sentiment(model, "I am Nikita Breslavsky")
predict_sentiment(model, "I hate Nikita Breslavsky")
predict_sentiment(model, "I love Nikita Breslavsky")
predict_sentiment(model, "I love to kill people")
predict_sentiment(model, "I dont love myself")