In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from catboost import CatBoostClassifier as CB

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/nlp-dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-dataset/test.csv')

# Keep the label column in its own variable; it is used later for splitting and model fitting.
target_train = train_df['target']
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Move the label into a capitalised 'Target' column (the lowercase column is removed).
# NOTE(review): presumably done so the label cannot clash with a lowercase TF-IDF term
# column named "target" — confirm.
train_df['Target'] = train_df.pop('target')

# Replace every missing value with the literal string 'NaN', in place, in both frames.
for frame in (train_df, test_df):
    frame.fillna('NaN', inplace=True)

In [None]:
def clean_text(text):
    """Normalize a raw string for vectorization.

    The steps run in this order:
      1. strip ``http://`` / ``https://`` URLs,
      2. delete every character that is neither an ASCII letter nor whitespace
         (punctuation, digits, symbols),
      3. collapse whitespace runs to a single space and trim both ends,
      4. lowercase the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # (pattern, replacement) pairs, applied strictly in sequence.
    substitutions = (
        (r'http[s]?://\S+', ''),   # URLs
        (r'[^a-zA-Z\s]', ''),      # punctuation and numbers
        (r'\s+', ' '),             # extra whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()
    return cleaned.lower()

In [None]:
# Extract the WordNet corpus archive into the NLTK corpora directory
# (presumably needed so WordNetLemmatizer can load WordNet in this image — TODO confirm).
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
%%time
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

lem = WordNetLemmatizer()

# Built once at definition time: the stop-word set used to be rebuilt for every
# single token inside lemmatize_text, which dominated the run time of the cell.
_STOP_WORDS = frozenset(stopwords.words('english'))


def lemmatize_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Args:
        text: The string to process. Stop-word matching is an exact,
            case-sensitive membership test against NLTK's English list.

    Returns:
        The lemmatized tokens joined by single spaces (empty string when no
        token survives).
    """
    tokens = word_tokenize(text)
    kept = (token for token in tokens if token not in _STOP_WORDS)
    return ' '.join(lem.lemmatize(token) for token in kept)

In [None]:
# Columns to normalise. Only 'text' is processed; 'keyword' and 'location' are left as they are.
to_change = ['text']
for col in to_change:
    # Same pipeline for both frames: regex cleaning first, then stop-word removal + lemmatization.
    for frame in (train_df, test_df):
        frame[col] = frame[col].apply(clean_text).apply(lemmatize_text)

In [None]:
# Preview the training frame after cleaning and lemmatization.
train_df.head()

In [None]:
# Preview the test frame after cleaning and lemmatization.
test_df.head()

**TF-IDF Vectorization**

In [None]:
# One TF-IDF vectorizer per text column. max_features caps the vocabulary size and
# stop_words='english' selects scikit-learn's built-in English stop-word list.
vect_text = TfidfVectorizer(max_features=5000, stop_words='english')
# NOTE(review): vect_keyword and vect_location are not used by any active code in the
# visible cells (only by commented-out lines) — confirm whether they are still needed.
vect_keyword = TfidfVectorizer(max_features=500, stop_words='english')
vect_location = TfidfVectorizer(max_features=500, stop_words='english')

def tfidf(data, vect, train=1):
    """Vectorize ``data`` with ``vect`` and return the scores as a dense DataFrame.

    Args:
        data: Iterable of documents (typically a pandas Series of strings).
        vect: Vectorizer exposing ``fit_transform``, ``transform`` and
            ``get_feature_names_out`` (e.g. a ``TfidfVectorizer``).
        train: Truthy to fit the vectorizer on ``data`` before transforming;
            falsy to reuse the already fitted vocabulary.

    Returns:
        A DataFrame with one column per vocabulary term and one row per
        document. When ``data`` is a pandas Series/DataFrame its index is kept,
        so the result lines up with the source frame in a later
        ``pd.concat(axis=1)``; otherwise a default RangeIndex is used.
    """
    matrix = vect.fit_transform(data) if train else vect.transform(data)

    # Plain lists/tuples expose an unrelated ``.index`` method, hence the explicit type check.
    row_index = data.index if isinstance(data, (pd.Series, pd.DataFrame)) else None

    # Build the frame straight from the dense array instead of one dict entry per term.
    return pd.DataFrame(
        matrix.toarray(),
        columns=vect.get_feature_names_out(),
        index=row_index,
    )

def create_features(df, train=1):
    """Replace the raw columns of ``df`` with TF-IDF features of its 'text' column.

    The raw 'text', 'id', 'keyword' and 'location' columns are removed *before*
    the TF-IDF columns are attached. Dropping them by label afterwards would
    also delete any vocabulary term that happens to be called "text", "id",
    "keyword" or "location".

    Args:
        df: Frame containing at least 'text', 'id', 'keyword' and 'location'.
            It is not modified.
        train: Truthy to fit ``vect_text`` on this frame, falsy to reuse the
            fitted vocabulary.

    Returns:
        A new DataFrame: the TF-IDF columns first, followed by whatever columns
        of ``df`` remain (e.g. the label column).

    Raises:
        KeyError: If one of the four raw columns is missing from ``df``.
    """
    text_tfidf = tfidf(df['text'], vect_text, train)
    # Keyword/location features are currently disabled:
    #keyword_tfidf = tfidf(df['keyword'], vect_keyword, train)
    #location_tfidf = tfidf(df['location'], vect_location, train)

    remaining = df.drop(columns=['text', 'id', 'keyword', 'location'])

    return pd.concat((text_tfidf, remaining), axis=1)

In [None]:
%%time
# Save the test ids first: create_features removes the 'id' column and the ids
# are needed again for the submission file.
ids = test_df['id'].copy()

# Fit the TF-IDF vocabulary on the training text (train=1), then reuse it for the test text (train=0).
# NOTE(review): both names are overwritten with the feature frames, so this cell
# cannot be re-run without reloading the data (the 'text' column is gone afterwards).
train_df = create_features(train_df, train=1)
test_df = create_features(test_df, train=0)

def uinque_index(cols):
    """Return the labels in ``cols`` with duplicated ones made unique.

    A label that occurs more than once gets its position appended
    (``"name_<position>"``); labels that occur once are returned unchanged.
    The misspelled name is kept because existing cells call it.

    Args:
        cols: Iterable of hashable column labels (e.g. ``DataFrame.columns``).

    Returns:
        A list of labels with the same length and order as ``cols``.
    """
    from collections import Counter

    labels = list(cols)
    # Count every label once up front instead of rescanning the list per column.
    occurrences = Counter(labels)
    return [
        f"{label}_{position}" if occurrences[label] > 1 else label
        for position, label in enumerate(labels)
    ]

# Make the column labels of both frames unique (duplicated labels get a positional suffix).
train_df.columns = uinque_index(train_df.columns)
test_df.columns = uinque_index(test_df.columns)

In [None]:
# Position of the 'Target' column in the training frame (raises ValueError if it is absent).
train_df.columns.values.tolist().index('Target')

In [None]:
# (rows, columns) of the training and test feature frames.
train_df.shape, test_df.shape

In [None]:
# Summary statistics of the label column.
train_df['Target'].describe()

In [None]:
# Remove the label from the feature frame before splitting, so neither the split
# parts nor the final model ever see it as a feature. errors='ignore' keeps this
# cell re-runnable once the column is already gone.
train_df.drop(columns=['Target'], errors='ignore', inplace=True)

# NOTE: despite their names, xtest/ytest hold the 80% portion used for fitting;
# xval/yval hold the 20% validation portion.
xtest, xval, ytest, yval = train_test_split(
    train_df,
    target_train,
    test_size=0.2,
    shuffle=True,
    stratify=target_train,  # same labels as the removed 'Target' column
    random_state=42,        # fixed seed so the split is reproducible across runs
)
    
def is_dropping_good(xtest, xval, ytest, yval):
    """Compare validation F1 before and after dropping the least important half of the features.

    A CatBoost classifier (100 iterations, CPU) is fitted on ``xtest``/``ytest``
    and scored on ``xval``/``yval``. Its feature importances are summarised and
    plotted, the lower-ranked half of the columns is removed, and a fresh
    classifier is fitted and scored on the reduced frames.

    Args:
        xtest: Feature frame used for fitting. Modified in place.
        xval: Feature frame used for scoring. Modified in place.
        ytest: Labels matching ``xtest``.
        yval: Labels matching ``xval``.

    Returns:
        The ``argsort`` of the first model's feature importances, i.e. column
        positions ordered from least to most important.
    """
    def _fit_fresh_model():
        # New classifier each time, fitted on the current state of xtest.
        model = CB(iterations=100, task_type='CPU', verbose=False)
        model.fit(xtest, ytest)
        return model

    separator = '-'*100

    baseline = _fit_fresh_model()
    importances = baseline.feature_importances_

    print(f"f1 with all features: {f1_score(yval, baseline.predict(xval))}")
    print(pd.DataFrame(importances).describe())
    sns.scatterplot(pd.DataFrame(importances))
    plt.show()

    print(separator)
    print("shapes before", xval.shape, xtest.shape)

    fimp = importances.argsort()
    weakest_half = fimp[:len(fimp) // 2]
    # Both frames are reduced in place, so the caller's objects change as well.
    for frame in (xval, xtest):
        frame.drop([frame.columns[i] for i in weakest_half], axis=1, inplace=True)

    print("shapes after", xval.shape, xtest.shape)
    print(separator)

    reduced = _fit_fresh_model()
    print(f"f1 after dropping features: {f1_score(yval, reduced.predict(xval))}")

    return fimp

In [None]:
# Alternative classifier, currently disabled:
#clf = MultinomialNB()
# Final model: CatBoost fitted on the complete training feature frame.
clf = CB(iterations=1000, task_type='CPU', verbose=False)
clf.fit(train_df, target_train)

# NOTE(review): this F1 is computed on the same rows the model was fitted on,
# so it is not a held-out estimate.
f1_score(target_train, clf.predict(train_df))

In [None]:
# Pair the saved test ids with the model's predictions for the test feature frame.
submission = pd.DataFrame({
    'id':ids,
    'target':clf.predict(test_df)
})

# Write the submission to the current working directory, without the frame's index.
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
# Summary statistics of the submission columns.
submission.describe()

In [None]:
# Show the first rows of the provided sample submission (presumably to compare its layout with ours).
pd.read_csv('/kaggle/input/nlp-dataset/sample_submission.csv').head()