In [None]:
# Download the AirlineTweets.csv file read by the data-loading cell below.
# wget's -nc (no-clobber) flag skips the download when the file already exists.
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight


from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.models import Model


In [None]:
threshold = 0.5 # minimum airline_sentiment_confidence: only rows strictly above this value are kept when the CSV is loaded

def process_string(s):
    """Return ``s`` with its first whitespace-delimited word removed.

    The remaining words are re-joined with single spaces, so runs of
    whitespace are collapsed and leading/trailing whitespace is dropped.

    Args:
        s: Text to process (a ``str``).

    Returns:
        The text without its first word. An empty string is returned when
        ``s`` contains one word or none (empty or whitespace-only input).
    """
    # Slice instead of ``pop(0)`` so an empty or whitespace-only string
    # yields "" rather than raising IndexError.
    # NOTE(review): presumably the first word is the leading @airline handle
    # of each tweet -- confirm against the dataset.
    return ' '.join(s.split()[1:])


# Sentiment label -> integer class id used as the training target
target_map = {"positive":1,"negative":0,"neutral":2}

# Read the tweets, keep only the rows labelled with enough confidence and the
# two columns needed, drop the first word of every tweet, and attach the
# numeric target -- expressed as a single method chain.
df = (
    pd.read_csv('AirlineTweets.csv', delimiter=",")
    .loc[
        lambda frame: frame['airline_sentiment_confidence'] > threshold,
        ['airline_sentiment', 'text'],
    ]
    .assign(text=lambda frame: frame['text'].apply(process_string))
    .assign(target=lambda frame: frame['airline_sentiment'].map(target_map))
)

# Convenience handles on the label and text columns
sentiment = df["airline_sentiment"]
text = df["text"]

# Preview the prepared frame and report its size
print(df.head())
print(f"Number of samples: {len(df)}")



In [None]:
# Plot how many tweets fall into each sentiment class; the classes are
# heavily imbalanced.
df.loc[:, 'airline_sentiment'].hist()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
df_train, df_test = train_test_split(df, random_state=42, test_size=0.3)

# TF-IDF features over lower-cased text with English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer. Targets are the integer class ids.
Xtrain, Ytrain = tfidf_vectorizer.fit_transform(df_train['text']), df_train['target']
Xtest, Ytest = tfidf_vectorizer.transform(df_test['text']), df_test['target']

# Compute per-class weights ('balanced' mode) to counteract the class imbalance
classes = np.unique(Ytrain)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=Ytrain)
# Key every weight by its actual class id (weights come back in the order of
# `classes`) instead of by list position, so the mapping stays correct even if
# the ids present in Ytrain are not exactly 0..n-1.
class_weight_dict = {int(label): weight for label, weight in zip(classes, class_weights)}


# Number of classes = number of distinct target ids in the full dataset
K = df['target'].nunique()
print(f"Num of classes: {K}")

# Input dimensions = number of TF-IDF feature columns
D = Xtrain.shape[1]
print(f"Iput D {D}")

# TensorFlow needs the features in dense form, so expand the sparse matrices
Xtrain, Xtest = Xtrain.toarray(), Xtest.toarray()


# Hand plain NumPy arrays to the model whenever the labels expose `to_numpy`;
# anything else is passed through unchanged.
# NOTE(review): the original author observed that this conversion helps but
# did not record why -- worth confirming.
if hasattr(Ytrain, "to_numpy"):
    Ytrain = Ytrain.to_numpy()
if hasattr(Ytest, "to_numpy"):
    Ytest = Ytest.to_numpy()


# Build model: D input features -> two ReLU hidden layers -> K output units
i = Input(shape=(D,))
x = Dense(500,activation='relu')(i)
x = Dense(300,activation='relu')(x)
# No activation on the output layer: it emits raw logits. The loss is created
# with from_logits=True and the prediction cells call tf.nn.softmax on the
# model output themselves, so a softmax activation here would be applied twice.
x = Dense(K)(x)

model = Model(i,x)
model.summary()

In [None]:
# Configure training: Adam optimizer, accuracy metric, and sparse categorical
# cross-entropy on integer labels. from_logits=True makes the loss treat the
# model outputs as unnormalised logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train for 7 epochs in mini-batches of 128, weighting each class by
# class_weight_dict and using the test split as validation data.
r = model.fit(
    x=Xtrain,
    y=Ytrain,
    batch_size=128,
    epochs=7,
    class_weight=class_weight_dict,
    validation_data=(Xtest, Ytest),
)

# Plot the recorded training and validation loss curves
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(r.history[history_key], label=curve_label)
plt.legend()

In [None]:
# Calculate AUC score: softmax the model output for each split ...
P_train, P_test = (tf.nn.softmax(model.predict(split)) for split in (Xtrain, Xtest))


# ... then score it with one-vs-one multi-class AUC
print("Train AUC:", roc_auc_score(y_true=Ytrain, y_score=P_train, multi_class='ovo'))
print("Test AUC:", roc_auc_score(y_true=Ytest, y_score=P_test, multi_class='ovo'))


In [None]:
# Sample custom text for prediction
custom_text = ["it was mid", "it was awesome","worst ever","best ever service "]

# Transform the custom text using the same (already fitted) vectorizer and
# convert it to a dense array, as was done for the training features
custom_text_vectorized = tfidf_vectorizer.transform(custom_text)
custom_text_vectorized = custom_text_vectorized.toarray()

# Make the prediction
predictions = tf.nn.softmax(model.predict(custom_text_vectorized))


# Get the class with the highest probability
predicted_classes = np.argmax(predictions, axis=1)

# Map the predicted class ids back to text labels. The list is derived from
# target_map (ordered by class id) so it cannot drift from the encoding used
# for training; this relies on the ids being 0..K-1, as they are in target_map.
class_labels = [label for label, _ in sorted(target_map.items(), key=lambda item: item[1])]
predicted_labels = [class_labels[class_id] for class_id in predicted_classes]



# Display the predictions. The loop variable is deliberately not called `text`:
# that name would overwrite the notebook-level `text` Series created when the
# data was loaded.
for sample_text, predicted_label in zip(custom_text, predicted_labels):
    print(f"Text: {sample_text}\nPrediction: {predicted_label}\n")
