In [4]:
import pandas as pd

# Load the training and test splits from JSON files in the working directory
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

# Inspect the training frame: first rows, per-column summary, and shape
print(train_df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
train_df.info()
print(train_df.shape)

                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1
<class 'pandas.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   text    25000 non-null  str  
 1   label   25000 non-null  int64
dtypes: int64(1), str(1)
memory usage: 32.0 MB
None
(25000, 2)


In [5]:
# Column names present in the training frame
print(train_df.columns)

# Number of missing values in each column
missing_per_column = train_df.isnull().sum()
print(missing_per_column)

# Row count per value of the 'label' column (expected to be balanced 50/50)
label_counts = train_df['label'].value_counts()
print(label_counts)

Index(['text', 'label'], dtype='str')
text     0
label    0
dtype: int64
label
1    12500
0    12500
Name: count, dtype: int64


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

# TF-IDF featuriser: vocabulary capped at 5000 features, English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Targets are the 'label' column of each split
y_train, y_test = train_df['label'], test_df['label']

# Fit the vectorizer on the training text only, then apply that same fitted
# vectorizer to the test text (transform, not fit_transform) so the test
# features are built from the training vocabulary.
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

In [7]:
# Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so construction and fitting are chained)
model = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test)

# Overall accuracy, followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(report)

Accuracy: 0.8391
              precision    recall  f1-score   support

           0       0.83      0.86      0.84     12500
           1       0.85      0.82      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [8]:
# Save both vectorizer and model together
import joblib

# Wrap the fitted vectorizer and classifier in a single Pipeline
from sklearn.pipeline import Pipeline

# Chain the already-fitted vectorizer and classifier into one Pipeline object
# so both are persisted (and later reloaded) as a single artifact.
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('classifier', model),
]
final_model = Pipeline(steps=pipeline_steps)

# Serialize the combined pipeline to disk; dump() returns the list of written filenames
joblib.dump(final_model, 'skills_assessment.joblib')

['skills_assessment.joblib']