# Kaggle submission notebook  

This notebook produces the Kaggle submission, based on the research and experiments performed in the main notebook.  


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor

import warnings
# Register a blanket "ignore" filter: with no category given it applies to every Warning subclass.
warnings.filterwarnings("ignore")
# Additionally register an explicit "ignore" filter for UserWarning
# (already covered by the blanket filter above, so this is belt-and-braces).
warnings.simplefilter("ignore", UserWarning)


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import os

# Walk the Kaggle input mount recursively and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [31]:
# Both competition CSVs live in the same input directory; build the paths from it.
input_dir = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'
train_df = pd.read_csv(f'{input_dir}/train.csv')
test_df = pd.read_csv(f'{input_dir}/test.csv')

# Quick sanity check: (rows, columns) of each split.
print("Train Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)

Train Data Shape: (17307, 3)
Test Data Shape: (3, 2)


In [32]:
# Plain-text dump of the test frame (columns: essay_id, full_text) to eyeball its content.
print(test_df)

  essay_id                                          full_text
0  000d118  Many people have car where they live. The thin...
1  000fe60  I am a scientist at NASA that is discussing th...
2  001ab80  People always wish they had the same technolog...


In [None]:
%%time

# Compiled once at definition time; the patterns themselves are unchanged.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+")
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z0-9\s]")

# Filled lazily by _clean_text_resources() on the first clean_text() call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop word set, lemmatizer) pair, building it on first use only."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both objects before publishing them so a failure (e.g. a missing
        # NLTK corpus) never leaves the cache half-populated.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TEXT_RESOURCES['stop_words'] = stop_words
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = lemmatizer
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """
    Normalise one essay into a space-separated string of lemmatized tokens.

    Steps, in order:
        - Lowercase the text
        - Remove URLs (anything starting with http / https / www up to the next whitespace)
        - Remove every character that is not a letter, digit or whitespace
        - Tokenize with nltk.word_tokenize
        - Drop English stopwords
        - Lemmatize each remaining token with WordNetLemmatizer

    The stopword set and the lemmatizer do not depend on the input, so they are
    created once and reused across calls instead of being rebuilt for every essay.

    Parameters
    ----------
    text : str
        Raw essay text. ``.lower()`` is called on it directly, so non-string
        values (e.g. NaN) are not supported.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    stop_words, lemmatizer = _clean_text_resources()

    text = text.lower()
    text = _URL_PATTERN.sub("", text)
    text = _NON_ALNUM_PATTERN.sub("", text)

    tokens = nltk.word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Register tqdm's pandas integration; this is what makes `progress_apply` available below.
tqdm.pandas(desc="Cleaning Text Data")
# Clean every training essay and keep the result next to the raw text in a new column.
train_df['cleaned_text'] = train_df['full_text'].progress_apply(clean_text)

Cleaning Text Data: 100%|██████████| 17307/17307 [00:40<00:00, 423.23it/s]

CPU times: user 40 s, sys: 871 ms, total: 40.9 s
Wall time: 40.9 s





In [None]:
def vectorize_text(train, column='cleaned_text', num_features=1000):
    """
    Fit a TF-IDF vectorizer on one text column and encode that column.

    Parameters
    ----------
    train : object indexable by column name (``train[column]`` is used)
        Holds the documents to fit on.
    column : str
        Name of the text column to vectorize.
    num_features : int
        Forwarded to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the result of ``fit_transform`` on the column
        and the fitted vectorizer, so the same vocabulary can be reused later.
    """
    tfidf = TfidfVectorizer(max_features=num_features)
    documents = train[column]
    tfidf_matrix = tfidf.fit_transform(documents)
    return tfidf_matrix, tfidf

# Registry of fitted TF-IDF artefacts, keyed by vocabulary size (max_features).
set_of_tf_idf_features = {}

num_features = 3000
X_train_vec, vectorizer = vectorize_text(train_df, num_features=num_features)
# Key the entry by num_features itself (not a repeated literal) so the key and the
# vocabulary size actually used for fitting cannot drift apart.
set_of_tf_idf_features[num_features] = {
    'X_train': X_train_vec,
    'vectorizer': vectorizer
}

In [35]:
%%time
# Best model from the experiments: LightGBM Regressor on TF-IDF with 3000 features.
test_df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
test_df['cleaned_text'] = test_df['full_text'].apply(clean_text)

# Take the vectorizer AND the training matrix from the same registry entry, so the
# model is always trained on the matrix produced by the vectorizer used for the test set.
tfidf_artifacts = set_of_tf_idf_features[3000]
X_test_vec = tfidf_artifacts['vectorizer'].transform(test_df['cleaned_text'])

# Retrain the best model on the full training set.
parameters = {'num_leaves': 12
              , 'max_depth': -1
              , 'learning_rate': 0.1
              , 'n_estimators': 500}
# NOTE(review): LightGBM documents `class_weight` for classification tasks; it is kept
# here unchanged from the original configuration -- confirm it has the intended effect
# on a regressor.
final_model = LGBMRegressor(**parameters
                            , class_weight='balanced'
                            , objective='regression'
                            , random_state=42
                            , verbose=-1)

final_model.fit(tfidf_artifacts['X_train'], train_df['score'])

# Predict on the test set, round the continuous outputs to integers,
# then clamp them to the 1..6 range.
test_predictions = final_model.predict(X_test_vec)
test_predictions = np.round(test_predictions).astype(int)
test_predictions = np.clip(test_predictions, 1, 6)

# Write the submission file: one row per essay with its predicted score.
submission = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': test_predictions
})
submission.to_csv('submission.csv', index=False)

CPU times: user 1min 1s, sys: 131 ms, total: 1min 1s
Wall time: 31.1 s
