In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import sqlite3

# Step 1: Load your CSV data into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/CS5588_Assignment2/dataset.csv')

# Step 2: Connect to SQLite database (or create a new one if it doesn't exist)
conn = sqlite3.connect('database_0912.db')

# Step 3: Load data into the database
# This will create a new table named 'your_table_name' and insert the data.
# If the table already exists, you can use the parameter `if_exists='replace'` to replace it,
# `if_exists='append'` to add more data, or `if_exists='fail'` to throw an error.
df.to_sql('newsdata', conn, if_exists='replace', index=False)

# Optionally, you can verify by reading from the database
df = pd.read_sql('SELECT * FROM newsdata', conn)

# Close the connection to the database
conn.close()


In [3]:
df

Unnamed: 0,Raytheon Shows Its Strength Heading Into United Technologies Deal,neutral
0,We're Keeping An Eye On Sangamo Therapeutics's...,neutral
1,The company 's share is quoted on NASDAQ OMX H...,neutral
2,China Mengniu Dairy : DISCLOSEABLE TRANSACTION...,neutral
3,The acquisition price was not disclosed .,neutral
4,Lagarde Says ECB's Options Limited by Low Rate...,negative
...,...,...
60582,"BP, Statoil, to Withdraw Staff From Algeria Fo...",negative
60583,"The copying , republication or redistribution ...",neutral
60584,Operating profit margin increased from 11.2 % ...,positive
60585,$vxx adding to position here !,positive


In [4]:
df.columns = ['input', 'output']
df['output'] = df['output'].apply(lambda x:x.strip())
df

Unnamed: 0,input,output
0,We're Keeping An Eye On Sangamo Therapeutics's...,neutral
1,The company 's share is quoted on NASDAQ OMX H...,neutral
2,China Mengniu Dairy : DISCLOSEABLE TRANSACTION...,neutral
3,The acquisition price was not disclosed .,neutral
4,Lagarde Says ECB's Options Limited by Low Rate...,negative
...,...,...
60582,"BP, Statoil, to Withdraw Staff From Algeria Fo...",negative
60583,"The copying , republication or redistribution ...",neutral
60584,Operating profit margin increased from 11.2 % ...,positive
60585,$vxx adding to position here !,positive


# Modelling part

In [5]:
!pip install keras
!pip install tensorflow



In [6]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer # Import Tokenizer from tensorflow.keras.preprocessing.text
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

# Data Processing

In [7]:
def remove_tags(string):
    # Remove HTML tags
    result = re.sub(r'<[^>]+>', '', string)
    # Remove URLs
    result = re.sub(r'https?://\S+|www\.\S+', '', result)
    # Remove non-alphanumeric characters (keeping spaces)
    result = re.sub(r'[^a-zA-Z0-9\s]', ' ', result)
    # Convert to lowercase
    result = result.lower()
    # Remove extra spaces (optional)
    result = re.sub(r'\s+', ' ', result).strip()
    return result

# Apply the function to the 'input' column of the DataFrame
df['input'] = df['input'].apply(remove_tags)

In [8]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['input'] = df['input'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
df['input'] = df.input.apply(lemmatize_text)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,input,output
0,keeping eye sangamo therapeutic nasdaq sgmo ca...,neutral
1,company share quoted nasdaq omx helsinki rauta...,neutral
2,china mengniu dairy discloseable transaction a...,neutral
3,acquisition price disclosed,neutral
4,lagarde say ecb option limited low rate low in...,negative
...,...,...
60582,bp statoil withdraw staff algeria following ro...,negative
60583,copying republication redistribution afx news ...,neutral
60584,operating profit margin increased 11 2 11 7,positive
60585,vxx adding position,positive


In [9]:
reviews = df['input'].values
labels = df['output'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [10]:
# Splitting training and testing dataset
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

# Modelling part

In [11]:
# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [12]:
# Define model
model = keras.Sequential([
    # Embedding layer
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

    # Bidirectional LSTM layer
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),

    # Additional LSTM layer
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),

    # Global max pooling layer to reduce dimensions
    keras.layers.GlobalMaxPooling1D(),

    # Dense hidden layer
    keras.layers.Dense(64, activation='relu'),

    # Dropout layer for regularization
    keras.layers.Dropout(0.5),

    # Output layer
    keras.layers.Dense(1, activation='sigmoid')
])

# Build the model
model.build(input_shape=(None, max_length))

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Display model summary
model.summary()



In [13]:
num_epochs = 2
history = model.fit(train_padded, train_labels,
                    epochs=num_epochs, verbose=1,
                    validation_split=0.1)

Epoch 1/2
[1m1278/1278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 29ms/step - accuracy: 0.4557 - loss: -140.3042 - val_accuracy: 0.5128 - val_loss: -2104.0273
Epoch 2/2
[1m1278/1278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 28ms/step - accuracy: 0.5109 - loss: -3751.1245 - val_accuracy: 0.5301 - val_loss: -9540.9404


In [28]:
model.save('my_model.h5')



In [14]:
prediction = model.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step
Accuracy of prediction on test set :  0.5354195550273981


In [22]:
predict_sequences = tokenizer.texts_to_sequences("Russia will be ‘at war’ with NATO if Ukraine long-range missile restrictions lifted, Putin warns")
predict_padded = pad_sequences(predict_sequences, padding='post', maxlen=10)

prediction = model.predict(predict_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", pred_labels)

In [25]:
prediction = model.predict(predict_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", pred_labels)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Accuracy of prediction on test set :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [27]:
prediction = model.predict(predict_padded)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [31]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.1-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [34]:
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json


tokenizer.fit_on_texts(train_sentences)

# Save the tokenizer
with open('tokenizer.json', 'w') as f:
    json.dump(tokenizer.to_json(), f)

In [36]:
import gradio as gr
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model and tokenizer
model = load_model('my_model.h5')

with open('tokenizer.json') as f:
    tokenizer_json = json.load(f)
    tokenizer = tokenizer_from_json(tokenizer_json)

# Define the prediction function
def predict_sentiment(text):
    # Preprocess the input text
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=1000)

    # Predict sentiment
    prediction = model.predict(padded_sequences)

    # Convert probability to sentiment label (0 or 1)
    sentiment = 'Positive' if prediction[0][0] >= 0.5 else 'Negative'

    return sentiment

# Set up Gradio interface
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="text",
    title="Sentiment Analysis",
    description="Enter a text and get the sentiment prediction (Positive/Negative)."
)

# Launch the interface
iface.launch()




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://086392372f9063b19e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


