# In [2]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Characters that preprocess_text blanks out: ASCII punctuation and digits.
_STRIP_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' + '0123456789'
# Built once at import time; each listed character maps to a single space so
# that removing it never glues the neighbouring words together.
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, ' ' * len(_STRIP_CHARS))


# Function to preprocess text
def preprocess_text(text):
    """Lowercase *text* and replace ASCII punctuation and digits with spaces.

    Each punctuation mark or digit becomes exactly one space, so
    ``"Hello,World"`` yields ``"hello world"`` rather than ``"helloworld"``.
    Runs of spaces are not collapsed and the result is not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string, the same length as ``text.lower()``.
    """
    # Lowercase first, then a single translate pass handles both character
    # classes (punctuation and digits) at once.
    return text.lower().translate(_STRIP_TABLE)

# Read the training and test CSVs from the working directory.
train_set, test_set = (
    pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv')
)

# Run the same text normalisation over the 'text' column of both frames.
for frame in (train_set, test_set):
    frame['text'] = frame['text'].apply(preprocess_text)

# Features are the cleaned text; targets are the 'lang_id' labels.
X_train, y_train = train_set['text'], train_set['lang_id']

# TF-IDF features over 1- to 3-grams feeding a linear-kernel SVM.
tfidf_step = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 3),
)
# NOTE(review): only predict() is used in this section; per the scikit-learn
# docs probability=True adds extra fitting cost. Consider dropping it if
# predict_proba is not needed elsewhere — confirm before changing.
svm_step = SVC(kernel='linear', probability=True)
pipeline_svm = Pipeline(steps=[('tfidf', tfidf_step), ('clf', svm_step)])

# Fit on all training rows, then label the test rows with the fitted model.
test_predictions_svm = pipeline_svm.fit(X_train, y_train).predict(test_set['text'])

# Pair each test row's 'index' value with its predicted language id.
submission_svm = pd.DataFrame(
    {
        'index': test_set['index'],
        'lang_id': test_predictions_svm,
    }
)

# Write the submission CSV without the DataFrame's own row index.
submission_file_path_svm = 'submission_svm.csv'
submission_svm.to_csv(submission_file_path_svm, index=False)
