In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Read both CSV files from the notebook's working directory. `train_df` is the
# frame the models are fitted on; `test_df` is the frame predictions are made for.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv')
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def preprocess_data(train_df, test_df, text_column='text'):
    """Convert the raw text of both frames into TF-IDF feature matrices.

    The vectorizer learns its vocabulary and IDF weights from the training
    text only; the test text is projected onto that fitted vocabulary.

    Parameters
    ----------
    train_df : DataFrame-like
        Training data, indexable by ``text_column``.
    test_df : DataFrame-like
        Test data, indexable by ``text_column``.
    text_column : str, default 'text'
        Name of the column that holds the raw documents.

    Returns
    -------
    tuple
        ``(train_features, test_features, vectorizer)``: the TF-IDF matrix of
        the training text, the TF-IDF matrix of the test text, and the fitted
        ``TfidfVectorizer`` so callers can transform further text consistently.
    """
    vectorizer = TfidfVectorizer()

    # fit_transform learns the vocabulary and produces the training matrix in a
    # single pass; it yields the same result as fit() followed by transform().
    train_features = vectorizer.fit_transform(train_df[text_column])

    # The test text is only transformed, never fitted on.
    test_features = vectorizer.transform(test_df[text_column])

    return train_features, test_features, vectorizer


In [4]:
# Build TF-IDF features for both frames and keep the fitted vectorizer that
# preprocess_data returns alongside them.
train_features, test_features, vectorizer = preprocess_data(train_df, test_df)


In [None]:
# Hold out 20% of the vectorized training rows for validation; the fixed seed
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_df['lang_id'],
    test_size=0.2,
    random_state=42,
)

# Baseline classifier: logistic regression with scikit-learn's default settings.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Score the baseline on the held-out rows using the weighted F1.
lr_preds = lr_model.predict(X_val)
lr_f1 = f1_score(y_true=y_val, y_pred=lr_preds, average='weighted')

print("Logistic Regression F1 Score:", lr_f1)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Tune a Multinomial Naive Bayes classifier. This cell expects X_train / y_train
# (used for the search) and X_val / y_val (held out for the final score) to
# already exist from the train/validation split made earlier in the notebook.
nb = MultinomialNB()

# Candidate smoothing strengths, each tried with and without learned class priors.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5],
              'fit_prior': [True, False]}

# Rank candidates by weighted F1, the same metric reported on the validation set.
f1_scorer = make_scorer(f1_score, average='weighted')

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole of X_train by
# default, so the fitted best estimator is reused rather than trained again.
best_nb_model = grid_search.best_estimator_

# Predictions on the validation set
y_pred = best_nb_model.predict(X_val)

# Evaluate the performance
nb_f1 = f1_score(y_val, y_pred, average='weighted')
print("Improved Naive Bayes F1 Score:", nb_f1)
print("Best Parameters:", best_params)

In [None]:
# Soft-voting ensemble of tuned Naive Bayes, SVM and Random Forest models.
# Expects X_train / y_train / X_val / y_val from the train/validation split cell.
# NOTE: this cell rebinds `nb`, `param_grid`, `f1_scorer` and `best_nb_model`,
# replacing whatever an earlier cell assigned to those names.
ENSEMBLE_SEED = 42  # fixed seed so the stochastic estimators give repeatable results

f1_scorer = make_scorer(f1_score, average='weighted')


def _tune(estimator, grid):
    """Grid-search ``estimator`` over ``grid`` and return the fitted search.

    Every candidate is scored by weighted F1 under 5-fold stratified
    cross-validation on ``X_train`` / ``y_train``.
    """
    search = GridSearchCV(estimator, grid, cv=StratifiedKFold(n_splits=5),
                          scoring=f1_scorer, n_jobs=-1)
    # GridSearchCV.fit returns the fitted search object itself.
    return search.fit(X_train, y_train)


# Naive Bayes model
nb = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5]}
grid_search_nb = _tune(nb, param_grid)
best_nb_model = grid_search_nb.best_estimator_

# Support Vector Machine model. probability=True is needed because soft voting
# averages predict_proba outputs; the seed fixes the internal shuffling used
# for those probability estimates.
svm = SVC(probability=True, random_state=ENSEMBLE_SEED)
param_grid_svm = {'C': [1, 10, 100], 'gamma': [0.01, 0.1, 1]}
grid_search_svm = _tune(svm, param_grid_svm)
best_svm_model = grid_search_svm.best_estimator_

# Random Forest model, seeded so the fitted trees are the same on every run.
rf = RandomForestClassifier(random_state=ENSEMBLE_SEED)
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
grid_search_rf = _tune(rf, param_grid_rf)
best_rf_model = grid_search_rf.best_estimator_

# Ensemble of models: average the class probabilities of the three tuned models.
voting_classifier = VotingClassifier(estimators=[
    ('nb', best_nb_model),
    ('svm', best_svm_model),
    ('rf', best_rf_model)
], voting='soft')

# Train the ensemble model
voting_classifier.fit(X_train, y_train)

# Predictions on the validation set
y_pred_ensemble = voting_classifier.predict(X_val)

# Evaluate the performance
ensemble_f1 = f1_score(y_val, y_pred_ensemble, average='weighted')
print("Ensemble F1 Score:", ensemble_f1)

In [None]:
# preprocess_data already transformed the test text with the fitted vectorizer
# and returned it as `test_features`, so reuse that matrix instead of
# transforming the same text a second time.
X_test = test_features

# Predict test labels with `best_nb_model`, i.e. whichever tuned Naive Bayes
# model the most recently executed tuning cell bound to that name.
test_predictions = best_nb_model.predict(X_test)

In [None]:
# Two-column submission frame: the test set's 'index' column next to the
# predicted 'lang_id' labels.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

# Write without the DataFrame's row index so the CSV holds only those two columns.
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Render the submission frame as this cell's output for a quick visual check.
submission_df