In [1]:
# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts

In [2]:
# importing the dataset
data_train = pd.read_json("skills_assessment_data/train.json")
data_test = pd.read_json("skills_assessment_data/test.json")

In [3]:
# top values of the data-set
data_train.head()


Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [4]:
data_test.head()

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [5]:
# shape of the data
data_train.shape


(25000, 2)

In [6]:
data_test.shape

(25000, 2)

In [7]:
# column names 
data_train.columns

Index(['text', 'label'], dtype='object')

In [8]:
data_test.columns

Index(['text', 'label'], dtype='object')

In [9]:
# count of unique values in the column
data_train['label'].value_counts()

label
1    12500
0    12500
Name: count, dtype: int64

In [10]:
data_test['label'].value_counts()

label
1    12500
0    12500
Name: count, dtype: int64

In [11]:
# top 10 elements of the dataset
data_train.head(10)

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
5,"This isn't the comedic Robin Williams, nor is ...",1
6,Yes its an art... to successfully make a slow ...,1
7,"In this ""critically acclaimed psychological th...",1
8,THE NIGHT LISTENER (2006) **1/2 Robin Williams...,1
9,"You know, Robin Williams, God bless him, is co...",1


In [12]:
# data from the bottom
data_train.tail(5)

Unnamed: 0,text,label
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0
24999,"This is one of the dumbest films, I've ever se...",0


In [13]:
def clean_text1(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

cleaned1=lambda x:clean_text1(x)

In [14]:
data_train['text']=pd.DataFrame(data_train.text.apply(cleaned1))
data_test['text']=pd.DataFrame(data_test.text.apply(cleaned1))

In [15]:
data_train.head(10)

Unnamed: 0,text,label
0,bromwell high is a cartoon comedy it ran at th...,1
1,homelessness or houselessness as george carlin...,1
2,brilliant overacting by lesley ann warren best...,1
3,this is easily the most underrated film inn th...,1
4,this is not the typical mel brooks film it was...,1
5,this isnt the comedic robin williams nor is it...,1
6,yes its an art to successfully make a slow pac...,1
7,in this critically acclaimed psychological thr...,1
8,the night listener robin williams toni colle...,1
9,you know robin williams god bless him is const...,1


In [16]:
data_test.head(10)

Unnamed: 0,text,label
0,i went and saw this movie last night after bei...,1
1,actor turned director bill paxton follows up h...,1
2,as a recreational golfer with some knowledge o...,1
3,i saw this film in a sneak preview and it is d...,1
4,bill paxton has taken the true story of the u...,1
5,i saw this film on september in indianapolis...,1
6,maybe im reading into this too much but i wond...,1
7,i felt this film did have many good qualities ...,1
8,this movie is amazing because the fact that th...,1
9,quitting may be as much about exiting a preord...,1


In [17]:
# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned2=lambda x:clean_text2(x)

In [18]:
data_train['text']=pd.DataFrame(data_train.text.apply(cleaned2))
data_test['text']=pd.DataFrame(data_test.text.apply(cleaned2))
data_train.head(10)

Unnamed: 0,text,label
0,bromwell high is a cartoon comedy it ran at th...,1
1,homelessness or houselessness as george carlin...,1
2,brilliant overacting by lesley ann warren best...,1
3,this is easily the most underrated film inn th...,1
4,this is not the typical mel brooks film it was...,1
5,this isnt the comedic robin williams nor is it...,1
6,yes its an art to successfully make a slow pac...,1
7,in this critically acclaimed psychological thr...,1
8,the night listener robin williams toni colle...,1
9,you know robin williams god bless him is const...,1


In [19]:
xtrain = data_train.iloc[0:,0].values
ytrain = data_train.iloc[0:,1].values
xtest = data_test.iloc[0:,0].values
ytest = data_test.iloc[0:,1].values

In [20]:
#xtrain,xtest,ytrain,ytest = tts(x_train,y_train,test_size = 0.25,random_state = 225)

In [21]:
tf = TfidfVectorizer()
from sklearn.pipeline import Pipeline

In [22]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
model=Pipeline([('vectorizer',tf),('classifier',classifier)])

model.fit(xtrain,ytrain)

In [23]:
ypred=model.predict(xtest)

In [24]:
# model score
accuracy_score(ypred,ytest)

0.88132

In [25]:
# confusion matrix
A=confusion_matrix(ytest,ypred)
print(A)

[[11006  1494]
 [ 1473 11027]]


In [26]:
# f1 score
recall=A[0][0]/(A[0][0]+A[1][0])
precision=A[0][0]/(A[0][0]+A[0][1])
F1=2*recall*precision/(recall+precision)
print(F1)

0.8812202249889908


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Initialize CountVectorizer with bigrams, min_df, and max_df to focus on relevant terms
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the message column
#X = vectorizer.fit_transform(data_train["text"])
# Build the pipeline by combining vectorization and classification
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB())
])
# Define the parameter grid for hyperparameter tuning
param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]
}

# Perform the grid search with 5-fold cross-validation and the F1-score as metric
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="f1",
    verbose=2, 
    n_jobs=-1   # Optional: speeds up by using all CPU cores
)

# Fit the grid search on the full dataset
grid_search.fit(xtrain, ytrain)

# Extract the best model identified by the grid search
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best model parameters: {'classifier__alpha': 1.0}


In [28]:
import joblib
model_filename = 'skills_assessment_md01.joblib'
# Save the trained model to a file for future use

#joblib.dump(model, model_filename)
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

Model saved to skills_assessment_md01.joblib


In [33]:
loaded_model = joblib.load(model_filename)
predictions = loaded_model.predict(xtest)

In [34]:
predictions


array([1, 1, 1, ..., 1, 0, 1], shape=(25000,))

In [36]:
import requests
import json

# Define the URL of the API endpoint
url = "http://10.129.203.82:5000/api/upload"

# Path to the model file you want to upload
model_file_path = "skills_assessment_md.joblib"

# Open the file in binary mode and send the POST request
with open(model_file_path, "rb") as model_file:
    files = {"model": model_file}
    response = requests.post(url, files=files)

# Pretty print the response from the server
print(json.dumps(response.json(), indent=4))

{
    "accuracy": 1.0,
    "flag": "HTB{s3nt1m3nt_4n4lys1s_d4t4}"
}
