In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix


url_train = "https://raw.githubusercontent.com/Ishaanshri95/datasets/main/data/kaggle/kaggle2/train.csv"
url_test  = "https://raw.githubusercontent.com/Ishaanshri95/datasets/main/data/kaggle/kaggle2/test.csv"

data = pd.read_csv(url_train)
data_test = pd.read_csv(url_test)
target = data['Prediction']

data_ids = data['id']
data_test_ids = data_test['id']
data.drop(columns='id', inplace=True, axis=1)
data_test.drop(columns='id', inplace=True, axis=1)

data['text'] = data['comment_text']
data_test['text'] = data_test['comment_text']
data.drop(columns='comment_text', inplace=True, axis=1)
data_test.drop(columns='comment_text', inplace=True, axis=1)

data.drop(columns='Prediction', inplace=True, axis=1)
print(data.shape, target.shape)
print(data.info())

(111723, 1) (111723,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111723 entries, 0 to 111722
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    111723 non-null  object
dtypes: object(1)
memory usage: 873.0+ KB
None


In [2]:
def gen_submission(output_labels):
  submission = pd.DataFrame()
  predicteds = [str(f'{list(output_label)}'.replace(', ', ',').replace('[', '"').replace(']', '"')) for index, output_label in enumerate(output_labels)]
  ids = [index for index, output_label in enumerate(output_labels)]
  submission['ID']         = np.array(ids)
  submission['Prediction'] = np.array(predicteds)
  submission.to_csv('submission_temp.csv', index=False)

  fin = open("submission_temp.csv", "rt")
  #output file to write the result to
  fout = open("submission.csv", "wt")
  #for each line in the input file
  for line in fin:
    #read replace the string and write to output file
    fout.write(line.replace('"""', '"'))
  #close input and output files
  fin.close()
  fout.close()

  print(submission)
  

data

Unnamed: 0,text
0,Explanation\nWhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ..."
4,"You, sir, are my hero. Any chance you remember..."
...,...
111718,"""\n Auto guides and the motoring press are not..."
111719,"""\nplease identify what part of BLP applies be..."
111720,Catalan independentism is the social movement ...
111721,"Spitzer \n\nUmm, theres no actual article for ..."


In [3]:
target = np.array([[int(item.split(',')[class_label]) for class_label in range(len(item.split(',')))] for item in target])

print(target.shape)

(111723, 6)


In [4]:
# !pip install xgboost
# !pip install lightgbm

# Preprocessing

## train test split

In [5]:
x_train, x_test, y_train, y_test = tts(data, target, test_size=0.3, random_state=42)

# NLP attempt 1 - count vectorizer

## Bagging for count vectorizer

### cleaning before bagging

In [17]:
import re
def cleanString(comment: str) -> str:
    comment = re.sub('n\'t', '', comment)
    comment = re.sub('\'m', '', comment)
    comment = re.sub('\'ve', '', comment)
    # comment = re.sub(' to', '', comment)
    # comment = re.sub('the', '', comment)
    comment = re.sub('\'s', '', comment)
    comment = re.sub(' is', '', comment)
    comment = re.sub(' are', '', comment)
    comment = re.sub(' have', '', comment)
    comment = re.sub(' has', '', comment)
    comment = re.sub(' a', '', comment)
    comment = re.sub(' the', '', comment)


    comment = comment.replace('\n', ' \n ')
    comment = comment.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')    
    comment = comment.replace(r'[0-9]', '')
    # remove punctuation and numbers
    # comment = re.sub('[^a-zA-Z!?%]', ' ', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    # del %
    comment = re.sub('%', '', comment)
    # remove multiple spaces
    comment = re.sub(r' +', ' ', comment)
    # remove newline
    comment = re.sub(r'\n', ' ', comment)
    # remove digits
    # comment = ''.join(i for i in comment if not i.isdigit())
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment

data.text = data.text.map(cleanString)
data_test.text = data_test.text.map(cleanString)

# Clean data sets
# data['text'] = data['text'].apply(text_clean)
# data_test['text'] = data_test['text'].apply(text_clean)

In [18]:
data

Unnamed: 0,text
0,Explanation Why edits made under my username H...
1,Dww He matches this background colour I seemin...
2,Hey man I really not trying to edit war It jus...
3,More I ca makeny real suggestions on improveme...
4,You sir my hero Any chance you remember what p...
...,...
111718,Auto guidesnd motoring press not good sources ...
111719,please identify what part of BLPpplies because...
111720,Catalan independentism social movement involvi...
111721,Spitzer Umm res noctualrticle for prostitution...


### count vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
cv = CountVectorizer()

X = cv.fit(data['text'])
X = cv.transform(data['text'])
X_test = cv.transform(data_test['text'])
print(X.shape, X_test.shape)

(111723, 266598) (47848, 266598)


### feature reduction by >1000 count filtering

In [8]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit(data['text'])
X = cv.transform(data['text'])
X_test = cv.transform(data_test['text']) #
print(X)
print(X.shape, X_test.shape) #

number_threshold = 500
print(np.sum((np.sum(X, axis=0)>number_threshold)))

list_indices = [index for index, val in enumerate(np.array(np.sum(X, axis=0)).flatten()>number_threshold) if val == 1]
X_reduced_freq_1000_filter = X[:, list_indices]
X_test_reduced_freq_1000_filter = X_test[:, list_indices]

print(X_reduced_freq_1000_filter.shape)
# pca = PCA(n_components=2)
# pca.fit(X)
# print(pca.explained_variance_ratio_)

  (0, 4443)	1
  (0, 34037)	1
  (0, 56088)	1
  (0, 56850)	1
  (0, 61403)	1
  (0, 69575)	1
  (0, 70379)	1
  (0, 71572)	1
  (0, 79438)	1
  (0, 82057)	1
  (0, 90895)	1
  (0, 114699)	1
  (0, 129290)	1
  (0, 136694)	1
  (0, 143966)	1
  (0, 148421)	1
  (0, 152819)	1
  (0, 158054)	1
  (0, 164076)	1
  (0, 171981)	1
  (0, 189412)	1
  (0, 192141)	1
  (0, 192505)	1
  (0, 207590)	1
  (0, 211055)	1
  :	:
  (111721, 44501)	1
  (111721, 76579)	1
  (111721, 150162)	1
  (111721, 179178)	1
  (111721, 190979)	1
  (111721, 193970)	1
  (111721, 214159)	1
  (111721, 240402)	1
  (111722, 4443)	1
  (111722, 49126)	1
  (111722, 74830)	1
  (111722, 110279)	3
  (111722, 124419)	1
  (111722, 127408)	1
  (111722, 127433)	1
  (111722, 152819)	1
  (111722, 158054)	1
  (111722, 181187)	1
  (111722, 213760)	1
  (111722, 227484)	1
  (111722, 233153)	1
  (111722, 247712)	1
  (111722, 251331)	1
  (111722, 255804)	1
  (111722, 263815)	1
(111723, 266598) (47848, 266598)
1309
(111723, 1309)


In [10]:
from sklearn.multiclass import OneVsRestClassifier
import sklearn

model = OneVsRestClassifier(sklearn.svm.LinearSVC(C=0.5))
model.fit(X, target)

target_pred = model.predict(X)
print(np.sum(target_pred == target)/np.prod(target))

print("how many of the scores are equal along all axes", np.sum(target_pred == target)/1)
print(f'{list(target_pred[0])}'.replace(', ', ',').replace('[', '"').replace(']', '"'))
gen_submission(target_pred)


  import sys


inf
how many of the scores are equal along all axes 668822.0
"0,0,0,0,0,0"
            ID     Prediction
0            0  "0,0,0,0,0,0"
1            1  "0,0,0,0,0,0"
2            2  "0,0,0,0,0,0"
3            3  "0,0,0,0,0,0"
4            4  "0,0,0,0,0,0"
...        ...            ...
111718  111718  "0,0,0,0,0,0"
111719  111719  "0,0,0,0,0,0"
111720  111720  "0,0,0,0,0,0"
111721  111721  "0,0,0,0,0,0"
111722  111722  "0,0,0,0,0,0"

[111723 rows x 2 columns]


In [38]:
print(type(X_reduced_freq_1000_filter), type(X))
print(np.shape(X_reduced_freq_1000_filter), np.shape(X))

<class 'scipy.sparse.csr.csr_matrix'> <class 'scipy.sparse.csr.csr_matrix'>
(111723, 803) (111723, 152903)


In [11]:
target_test = model.predict(X_test)
gen_submission(target_test)

          ID     Prediction
0          0  "0,0,0,0,0,0"
1          1  "0,0,0,0,0,0"
2          2  "1,0,0,0,0,0"
3          3  "0,0,0,0,0,0"
4          4  "0,0,0,0,0,0"
...      ...            ...
47843  47843  "0,0,0,0,0,0"
47844  47844  "0,0,0,0,0,0"
47845  47845  "0,0,0,0,0,0"
47846  47846  "0,0,0,0,0,0"
47847  47847  "0,0,0,0,0,0"

[47848 rows x 2 columns]


# sort by most frequent words

In [13]:
from sklearn.multiclass import OneVsRestClassifier
import sklearn

model = OneVsRestClassifier(sklearn.svm.LinearSVC(C=0.5))
import xgboost as xgb
import lightgbm as lgb
# model = OneVsRestClassifier(xgb.XGBClassifier(max_depth=6))
model.fit(X_reduced_freq_1000_filter, target)

target_pred = model.predict(X_reduced_freq_1000_filter)
print(np.sum(target_pred == target)/np.prod(target))

print("how many of the scores are equal along all axes", np.sum(target_pred == target)/1)
print(f'{list(target_pred[0])}'.replace(', ', ',').replace('[', '"').replace(']', '"'))
gen_submission(target_pred)



#####
target_test = model.predict(X_test_reduced_freq_1000_filter)
gen_submission(target_test)

  # This is added back by InteractiveShellApp.init_path()


inf
how many of the scores are equal along all axes 655399.0
"0,0,0,0,0,0"
            ID     Prediction
0            0  "0,0,0,0,0,0"
1            1  "0,0,0,0,0,0"
2            2  "0,0,0,0,0,0"
3            3  "0,0,0,0,0,0"
4            4  "0,0,0,0,0,0"
...        ...            ...
111718  111718  "0,0,0,0,0,0"
111719  111719  "0,0,0,0,0,0"
111720  111720  "0,0,0,0,0,0"
111721  111721  "0,0,0,0,0,0"
111722  111722  "0,0,0,0,0,0"

[111723 rows x 2 columns]
          ID     Prediction
0          0  "0,0,0,0,0,0"
1          1  "0,0,0,0,0,0"
2          2  "0,0,0,0,0,0"
3          3  "0,0,0,0,0,0"
4          4  "0,0,0,0,0,0"
...      ...            ...
47843  47843  "0,0,0,0,0,0"
47844  47844  "0,0,0,0,0,0"
47845  47845  "0,0,0,0,0,0"
47846  47846  "0,0,0,0,0,0"
47847  47847  "0,0,0,0,0,0"

[47848 rows x 2 columns]


# Use tfid vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True
)

X_transformed = word_vectorizer.fit_transform(data['text'])
X_test_transformed = word_vectorizer.transform(data_test['text'])

In [20]:
X_transformed.shape

(111723, 5321926)

In [26]:
from sklearn.multiclass import OneVsRestClassifier
import sklearn
from sklearn.linear_model import LogisticRegression

# model = OneVsRestClassifier(sklearn.svm.LinearSVC(C=0.5))
import xgboost as xgb
import lightgbm as lgb
model = OneVsRestClassifier(LogisticRegression(C = 10, penalty ='l2', solver='liblinear'))
model.fit(X_transformed, target)

target_pred = model.predict(X_transformed)
print(np.sum(target_pred == target)/np.prod(target))

print("how many of the scores are equal along all axes", np.sum(target_pred == target)/1)
print(f'{list(target_pred[0])}'.replace(', ', ',').replace('[', '"').replace(']', '"'))
gen_submission(target_pred)



#####
target_test = model.predict(X_test_transformed)
gen_submission(target_test)

  if sys.path[0] == '':


inf
how many of the scores are equal along all axes 669511.0
"0,0,0,0,0,0"
            ID     Prediction
0            0  "0,0,0,0,0,0"
1            1  "0,0,0,0,0,0"
2            2  "0,0,0,0,0,0"
3            3  "0,0,0,0,0,0"
4            4  "0,0,0,0,0,0"
...        ...            ...
111718  111718  "0,0,0,0,0,0"
111719  111719  "0,0,0,0,0,0"
111720  111720  "0,0,0,0,0,0"
111721  111721  "0,0,0,0,0,0"
111722  111722  "0,0,0,0,0,0"

[111723 rows x 2 columns]
          ID     Prediction
0          0  "0,0,0,0,0,0"
1          1  "0,0,0,0,0,0"
2          2  "0,0,0,0,0,0"
3          3  "0,0,0,0,0,0"
4          4  "0,0,0,0,0,0"
...      ...            ...
47843  47843  "0,0,0,0,0,0"
47844  47844  "0,0,0,0,0,0"
47845  47845  "0,0,0,0,0,0"
47846  47846  "0,0,0,0,0,0"
47847  47847  "0,0,0,0,0,0"

[47848 rows x 2 columns]
