In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [2]:
train = pd.read_csv('train_3.csv')
test = pd.read_csv('test_ZUT1mqB.csv')

In [3]:
train.head()

Unnamed: 0,ID,Title,Domain
0,1,"What is good in a decision tree, a large or a ...",Techniques
1,2,Training data only contains single positive label,Techniques
2,3,Calculating percentage contribution of a negat...,Techniques
3,4,Unable to open solution checker!,Hackathons
4,5,User Name Change,Misc


In [4]:
train.isnull().sum()

ID         0
Title     11
Domain     0
dtype: int64

In [5]:
train.shape

(3845, 3)

In [6]:
train.dropna(inplace=True)  # dropping null rows

### Text preprocessing

In [7]:
# chaning to lower case and keeping only alphabets
def lower_alpha(s):
    s = s.lower()
    for i in s:
        if i.isalpha()==False and i.isspace()==False:
            s=s.replace(i,'')
    return s

In [8]:
# remove noise
def remove_noise(s):
    noise_list = list(stopwords.words('english'))
    words = s.split()
    noise_free_words = [w for w in words if w not in noise_list]
    noise_free_statement = ' '.join(noise_free_words)
    return noise_free_statement  
    

In [9]:
# Lemmatization
def stemmer(s):
    ss = WordNetLemmatizer()
    s = ss.lemmatize(s)
    return s
    

In [10]:
def pre_processing(s):
    s = lower_alpha(s)
    s = remove_noise(s)
    s = stemmer(s)
    return s

In [11]:
pre_processing('Python String isspace() method returns “True” if all characters in the string are whitespace characters, Otherwise, It returns “False”. This function is used to check if the argument contains all whitespace characters, such as:')

'python string isspace method returns true characters string whitespace characters otherwise returns false function used check argument contains whitespace characters'

### merging the data

In [12]:
data = train.merge(test,how='outer')

In [13]:
data

Unnamed: 0,ID,Title,Domain
0,1,"What is good in a decision tree, a large or a ...",Techniques
1,2,Training data only contains single positive label,Techniques
2,3,Calculating percentage contribution of a negat...,Techniques
3,4,Unable to open solution checker!,Hackathons
4,5,User Name Change,Misc
...,...,...,...
5478,5490,Should we take all the variables from the trai...,
5479,5491,Time Series Forecasting and reducing it to sta...,
5480,5492,Data Visualization and text analysis (twitter ...,
5481,5493,Cross Validation strategy for stacked models,


In [14]:
data['Title'] = data['Title'].apply(pre_processing)

In [15]:
data

Unnamed: 0,ID,Title,Domain
0,1,good decision tree large small leaf size,Techniques
1,2,training data contains single positive label,Techniques
2,3,calculating percentage contribution negative c...,Techniques
3,4,unable open solution checker,Hackathons
4,5,user name change,Misc
...,...,...,...
5478,5490,take variables training data randomforest model,
5479,5491,time series forecasting reducing stationary se...,
5480,5492,data visualization text analysis twitter mining,
5481,5493,cross validation strategy stacked models,


In [16]:
# Vectorize
count = CountVectorizer(stop_words='english')
vec_terms = count.fit_transform(data['Title'])

In [17]:
col = count.get_feature_names_out()

In [18]:
train_matrix = pd.DataFrame(vec_terms.toarray(),columns=col)

In [19]:
train_matrix

Unnamed: 0,aaply,ab,abc,abinbev,able,abnormal,abroad,abruptly,absence,absenteeism,...,zero,zerofrequency,zeros,zip,zs,zscore,zstatistic,ztestttest,zxc,zxcscript
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
pca = PCA(n_components=1500)
pca.fit_transform(train_matrix)
print(pca.explained_variance_ratio_.sum())

0.9089226197731192


In [21]:
train_matrix = pd.DataFrame(pca.fit_transform(train_matrix))

In [22]:
train_matrix.shape

(5483, 1500)

In [23]:
le = LabelEncoder()

In [24]:
train_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,-0.205624,0.058571,-0.051421,-0.055213,0.014575,-0.134705,-0.061256,-0.072717,-0.073181,-0.125678,...,0.050900,0.003301,0.021703,-0.022282,-0.019340,-0.012108,0.007953,-0.043134,-0.020181,-0.025030
1,0.761258,-0.054988,-0.052165,-0.121936,-0.004112,-0.033614,-0.031553,-0.002996,-0.060930,0.050291,...,-0.005352,0.013232,0.017510,-0.013203,0.033570,0.002396,-0.005039,0.003821,-0.018678,0.051947
2,-0.206848,0.050461,-0.043814,-0.073492,0.017530,-0.088396,-0.028085,-0.042387,-0.009349,-0.042661,...,-0.001343,0.016503,-0.012240,-0.007581,0.017941,-0.013456,-0.039440,-0.028620,-0.032484,-0.022079
3,-0.204210,0.053352,-0.038774,-0.067058,0.043593,-0.126154,-0.047032,-0.021460,-0.026040,-0.082659,...,-0.003906,-0.017756,0.017169,-0.012809,-0.002114,-0.009787,0.014093,-0.018195,0.015447,0.005617
4,-0.197950,0.062790,-0.031550,-0.056928,0.028537,-0.098474,-0.037216,-0.039283,-0.037073,-0.069866,...,-0.012174,-0.011639,-0.010506,-0.014176,0.001941,0.000294,0.010111,-0.007723,-0.000379,0.007247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5478,0.693720,-0.147717,-0.023183,-0.402543,0.007516,0.777867,-0.005344,0.202549,-0.602101,0.090422,...,-0.005050,0.007301,-0.004027,0.008148,0.002382,-0.000405,0.000938,-0.015504,0.001365,-0.005060
5479,-0.193517,-0.069247,-0.127627,-0.183994,0.534240,-0.081987,1.874550,0.530867,0.298938,0.027546,...,-0.012938,-0.006874,-0.005300,-0.008370,0.005564,0.010829,-0.020271,-0.030564,-0.006746,0.001982
5480,0.798115,-0.107157,-0.090237,-0.108700,-0.013555,-0.102584,0.115700,-0.096061,0.090973,-0.170456,...,0.000878,-0.002762,-0.005737,0.002390,0.005618,0.002536,-0.000647,0.010675,-0.011705,0.000795
5481,-0.216356,0.048812,-0.028418,-0.104392,0.030187,-0.030956,-0.048552,-0.021155,-0.025976,-0.107619,...,0.003215,-0.003057,-0.008892,0.011838,0.008542,-0.009742,0.003439,0.011444,-0.000425,-0.002232


In [25]:
train_matrix['Domain'] = le.fit_transform(pd.DataFrame(data['Domain']))

  y = column_or_1d(y, warn=True)


In [26]:
le.classes_

array(['Career', 'Hackathons', 'Misc', 'Other', 'Resources', 'Techniques',
       'Tools', nan], dtype=object)

In [27]:
train_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1491,1492,1493,1494,1495,1496,1497,1498,1499,Domain
0,-0.205624,0.058571,-0.051421,-0.055213,0.014575,-0.134705,-0.061256,-0.072717,-0.073181,-0.125678,...,0.003301,0.021703,-0.022282,-0.019340,-0.012108,0.007953,-0.043134,-0.020181,-0.025030,5
1,0.761258,-0.054988,-0.052165,-0.121936,-0.004112,-0.033614,-0.031553,-0.002996,-0.060930,0.050291,...,0.013232,0.017510,-0.013203,0.033570,0.002396,-0.005039,0.003821,-0.018678,0.051947,5
2,-0.206848,0.050461,-0.043814,-0.073492,0.017530,-0.088396,-0.028085,-0.042387,-0.009349,-0.042661,...,0.016503,-0.012240,-0.007581,0.017941,-0.013456,-0.039440,-0.028620,-0.032484,-0.022079,5
3,-0.204210,0.053352,-0.038774,-0.067058,0.043593,-0.126154,-0.047032,-0.021460,-0.026040,-0.082659,...,-0.017756,0.017169,-0.012809,-0.002114,-0.009787,0.014093,-0.018195,0.015447,0.005617,1
4,-0.197950,0.062790,-0.031550,-0.056928,0.028537,-0.098474,-0.037216,-0.039283,-0.037073,-0.069866,...,-0.011639,-0.010506,-0.014176,0.001941,0.000294,0.010111,-0.007723,-0.000379,0.007247,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5478,0.693720,-0.147717,-0.023183,-0.402543,0.007516,0.777867,-0.005344,0.202549,-0.602101,0.090422,...,0.007301,-0.004027,0.008148,0.002382,-0.000405,0.000938,-0.015504,0.001365,-0.005060,7
5479,-0.193517,-0.069247,-0.127627,-0.183994,0.534240,-0.081987,1.874550,0.530867,0.298938,0.027546,...,-0.006874,-0.005300,-0.008370,0.005564,0.010829,-0.020271,-0.030564,-0.006746,0.001982,7
5480,0.798115,-0.107157,-0.090237,-0.108700,-0.013555,-0.102584,0.115700,-0.096061,0.090973,-0.170456,...,-0.002762,-0.005737,0.002390,0.005618,0.002536,-0.000647,0.010675,-0.011705,0.000795,7
5481,-0.216356,0.048812,-0.028418,-0.104392,0.030187,-0.030956,-0.048552,-0.021155,-0.025976,-0.107619,...,-0.003057,-0.008892,0.011838,0.008542,-0.009742,0.003439,0.011444,-0.000425,-0.002232,7


In [28]:
train = train_matrix[train_matrix['Domain']!=7]
test = train_matrix[train_matrix['Domain']==7]

In [29]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1491,1492,1493,1494,1495,1496,1497,1498,1499,Domain
3834,-0.194835,0.049942,-0.026708,-0.054210,0.027321,-0.096634,-0.036124,-0.025297,-0.022697,-0.073486,...,0.011801,-0.002494,0.009284,-0.012768,-0.015540,-0.000874,0.015902,-0.023609,-0.020223,7
3835,-0.234326,-0.376910,-0.148009,0.482720,0.632864,0.104521,-0.199187,-0.114990,0.062837,-0.128547,...,0.011736,0.005626,0.016391,-0.009650,-0.006652,-0.016744,-0.015022,0.010904,0.010727,7
3836,-0.193038,0.024240,-0.073973,-0.161800,0.013492,-0.050877,-0.028799,-0.047099,-0.065627,0.129283,...,-0.004815,0.004252,-0.001605,-0.007860,-0.005261,-0.018939,-0.007735,-0.009331,0.002830,7
3837,-0.212639,-0.015865,-0.064352,-0.066651,-0.021129,-0.120222,-0.071383,0.077677,-0.029803,-0.020764,...,0.033608,-0.018244,0.002979,-0.026528,-0.041334,-0.027437,-0.059069,-0.022352,0.060244,7
3838,-0.206676,0.046339,-0.050012,-0.056649,0.041579,-0.126917,-0.063397,-0.006558,-0.027963,-0.076808,...,-0.004685,0.014793,-0.008034,0.003285,-0.027613,-0.005585,-0.006303,0.030014,-0.009760,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5478,0.693720,-0.147717,-0.023183,-0.402543,0.007516,0.777867,-0.005344,0.202549,-0.602101,0.090422,...,0.007301,-0.004027,0.008148,0.002382,-0.000405,0.000938,-0.015504,0.001365,-0.005060,7
5479,-0.193517,-0.069247,-0.127627,-0.183994,0.534240,-0.081987,1.874550,0.530867,0.298938,0.027546,...,-0.006874,-0.005300,-0.008370,0.005564,0.010829,-0.020271,-0.030564,-0.006746,0.001982,7
5480,0.798115,-0.107157,-0.090237,-0.108700,-0.013555,-0.102584,0.115700,-0.096061,0.090973,-0.170456,...,-0.002762,-0.005737,0.002390,0.005618,0.002536,-0.000647,0.010675,-0.011705,0.000795,7
5481,-0.216356,0.048812,-0.028418,-0.104392,0.030187,-0.030956,-0.048552,-0.021155,-0.025976,-0.107619,...,-0.003057,-0.008892,0.011838,0.008542,-0.009742,0.003439,0.011444,-0.000425,-0.002232,7


In [30]:
x = train.drop(columns=['Domain'])
y = train['Domain']

In [31]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.7)

In [32]:
print(x_train.shape,y_train.shape)

(2683, 1500) (2683,)


In [33]:
xgb = XGBClassifier(n_estimators=200)

In [34]:
xgb.fit(x_train,y_train)

In [35]:
y_pred = xgb.predict(x_test)

In [36]:
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
1874,-0.199098,0.053989,-0.037960,-0.062003,0.031714,-0.107769,-0.034394,-0.032789,-0.024692,-0.085021,...,0.012436,0.017797,0.002494,-0.005375,0.009000,0.021856,-0.015386,-0.009574,-0.003000,0.007038
3177,-0.195359,0.065580,-0.031725,-0.060757,0.028843,-0.096457,-0.032018,-0.035369,-0.023909,-0.071892,...,0.053864,-0.008550,-0.051275,-0.003450,-0.032948,-0.017349,-0.024104,-0.013785,-0.029336,-0.112041
129,-0.201016,0.062638,-0.030722,-0.069205,0.084260,-0.127933,-0.072281,0.048487,-0.015338,-0.129242,...,0.017999,0.082130,-0.020405,0.021429,-0.008796,0.013705,0.011658,-0.030552,0.024575,0.016706
1414,-0.279311,-0.506239,-0.191538,0.476084,-0.813111,-0.119695,-0.066350,0.740871,0.065097,0.079921,...,0.002340,-0.016693,-0.025615,-0.018892,-0.024622,0.010864,-0.044581,0.009988,-0.007523,0.041364
1651,-0.292016,-0.026293,-0.142846,-0.421182,-0.274979,0.614413,-0.094270,-0.359151,0.924668,0.004767,...,0.004259,-0.006232,-0.033249,-0.005909,0.004575,0.004040,0.000032,-0.016861,-0.002925,-0.003109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,-0.211265,0.058909,-0.049297,-0.082947,0.022928,-0.094231,-0.049490,-0.058099,-0.014462,-0.049285,...,0.016308,0.004670,0.001427,-0.019507,-0.004432,-0.004536,0.006107,0.017313,0.017865,-0.002908
3708,-0.207813,0.045969,-0.045638,-0.061545,0.049473,-0.102756,-0.034252,-0.061658,-0.054333,-0.059857,...,0.031163,0.035960,-0.025212,0.024962,-0.052847,0.029114,-0.003458,-0.017215,-0.014023,-0.027139
3763,-0.182352,0.003780,-0.065035,-0.088051,0.183424,-0.097444,0.523193,0.125681,0.078657,-0.042624,...,0.015227,0.003525,0.025293,-0.001254,0.000800,-0.006926,0.024042,-0.006486,-0.007633,0.002852
3529,-0.198277,0.057825,-0.031084,-0.057722,0.028861,-0.095811,-0.032358,-0.033332,-0.026490,-0.072614,...,0.001887,0.002544,-0.029200,-0.015230,0.006563,-0.007960,-0.000522,0.031584,0.030133,0.036522


In [37]:
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average='macro'))

0.6281494352736751
0.3475571665862289


In [38]:
pred = xgb.predict(test.drop(columns=['Domain']))

In [39]:
test = pd.read_csv('test_ZUT1mqB.csv')

In [40]:
test['Domain'] = pred

In [41]:
test['Domain'] = le.inverse_transform(test['Domain'])

In [42]:
test[['ID','Domain']].to_csv('final_submission_xgb.csv',index=False)