In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_excel('Data_Train.xlsx')
test_data = pd.read_excel('Data_Test.xlsx')

In [3]:
train_data.shape

(7628, 2)

In [4]:
# combined_df = pd.concat([train_data, test_data], ignore_index=True)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [6]:
 # Preprocessing: Tokenization and Vectorization
vectorizer = CountVectorizer(stop_words='english')
x = vectorizer.fit_transform(train_data['STORY'])
y = vectorizer.transform(test_data['STORY'])

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


X_train, X_test, y_train, y_test = train_test_split(x, train_data['SECTION'], test_size=0.2)

# Define a list of models
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Multilayer Perceptron', MLPClassifier(max_iter=1000))
]

best_model_name = ""
best_model_score = 0

# Loop through models and evaluate
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    print(f'{name} Score: {score}')
    
    if score > best_model_score:
        best_model_score = score
        best_model_name = name

print(f'\nThe best model is {best_model_name} with a score of {best_model_score}')



Logistic Regression Score: 0.9593709043250328
Random Forest Score: 0.9515072083879423
Support Vector Machine Score: 0.9311926605504587
K-Nearest Neighbors Score: 0.63564875491481
Gradient Boosting Score: 0.9325032765399738
Multilayer Perceptron Score: 0.9678899082568807

The best model is Multilayer Perceptron with a score of 0.9678899082568807


In [9]:
x

<7628x32518 sparse matrix of type '<class 'numpy.int64'>'
	with 377305 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing: Tokenization and Vectorization
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['STORY'])
X_test = vectorizer.transform(test_data['STORY'])

# Define the target variable for X_train
y_train = train_data['SECTION']

# Initialize and train the model
model = MLPClassifier(max_iter=1000)
model.fit(X_train, y_train)

# Predict SECTION for y_test
y_pred = model.predict(X_test)

# The variable y_pred contains the predicted SECTION for test_data['STORY']


In [7]:
output_data = pd.DataFrame({'SECTION': y_pred})

output_data.to_csv('predicted_sections.csv', index=False)

In [27]:
print(y_pred)

[[2.5050725e-19 1.0000000e+00 8.3883091e-22 5.1179241e-20]
 [2.3157797e-03 2.0017563e-03 9.9527007e-01 4.1241152e-04]
 [7.2889289e-19 1.0000000e+00 7.1107095e-21 2.8929182e-18]
 ...
 [2.5733907e-08 9.9999988e-01 1.8212249e-09 7.7343032e-08]
 [9.5538270e-01 8.3472170e-03 2.5145590e-08 3.6270056e-02]
 [1.9022194e-05 9.9968755e-01 1.7284764e-06 2.9172120e-04]]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the data

train_data = pd.read_excel('Data_Train.xlsx')
test_data = pd.read_excel('Data_Test.xlsx')

# Preprocess the data (remove punctuation, stopwords, etc.)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data['STORY'], train_data['SECTION'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(test_data['STORY'])

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy}')

# Make predictions on the test data
test_predictions = clf.predict(X_test_tfidf)

# The 'test_predictions' variable now contains the predicted categories for the test data.


Accuracy: 0.9475753604193972


In [4]:
test_predictions

array([1, 2, 1, ..., 1, 3, 1], dtype=int64)

In [5]:
output_data = pd.DataFrame({'SECTION': test_predictions})

output_data.to_csv('predicted_sections.csv', index=False)