# Importing Libraries and Requirements

In [1]:
import time

In [2]:
# Record the wall-clock start so the last cells can report the total notebook run time.
start = time.time()

In [3]:
import pandas as pd
import numpy as np
import sklearn as sk

In [4]:
# Report the version of each core library so the run can be reproduced.
for library in (pd, np, sk):
    lib_name, lib_version = library.__name__, library.__version__
    print(lib_name, lib_version)

pandas 1.3.4
numpy 1.20.3
sklearn 0.24.2


In [5]:
def loading_data():
    """Read the three competition CSV files from the working directory.

    Returns:
        tuple: ``(train, test, sub)`` DataFrames read, in that order, from
        ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
    """
    print('Loading Dataset\n')
    file_names = ('train.csv', 'test.csv', 'sample_submission.csv')
    # Files are read in the order listed above, one DataFrame per file.
    train, test, sub = (pd.read_csv(name, delimiter=',') for name in file_names)
    print('Dataset Loaded')
    return train, test, sub

In [6]:
# Load the training table, the test table and the sample submission into the notebook namespace.
train, test, sub  = loading_data()

Loading Dataset

Dataset Loaded


In [7]:
# Duplicate-id check: a result of 0 means the number of distinct ids equals the number of rows.
train['id'].nunique() - train.shape[0]

0

In [8]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,NewsText,label
0,1553,Sannan ya ringa yin fina-finai a kan matsalol...,1
1,1059,Karanta cikakken labarin a nan,0
2,1006,Tuni dai Saudiyya ta ce za ta kaddamar da aiki...,1
3,1242,Wannan zai hana raguwar darajarsa kamar yadda ...,-1
4,1221,"Ba shi da wata iyaka, ma'ana iya shafukan da z...",0


In [9]:
def transform(data, details):
    """Encode or decode the label column.

    The mapping between the dataset labels and the class indices used for
    modelling is:

        label -1  <->  class 0
        label  0  <->  class 1
        label  1  <->  class 2

    Args:
        data: Iterable of label values (for example a 1-D array).
        details: ``'encode'`` to map labels to class indices, ``'decode'`` to
            map class indices back to labels.

    Returns:
        np.ndarray: The mapped values, in input order. Any value other than
        the two explicitly tested ones falls into the last branch of the
        chosen direction (2 when encoding, 1 when decoding).

    Raises:
        ValueError: If ``details`` is neither ``'encode'`` nor ``'decode'``.
    """
    if details == 'encode':
        mapped = [0 if label == -1 else 1 if label == 0 else 2 for label in data]
    elif details == 'decode':
        mapped = [-1 if label == 0 else 0 if label == 1 else 1 for label in data]
    else:
        # Any other value used to fall through to decoding silently, so a typo
        # such as 'Encode' produced wrong labels without any error.
        raise ValueError(
            f"details must be 'encode' or 'decode', got {details!r}"
        )

    return np.array(mapped)

In [10]:
# For cleaning the NewsText column
def data_remove(df, remove, add):
    """Replace every occurrence of a substring in each text entry.

    Args:
        df: Iterable of values, e.g. a DataFrame column (Series) or a list.
        remove: The substring to remove.
        add: The string to put in its place.

    Returns:
        list: One entry per input entry, in input order. String entries have
        every occurrence of ``remove`` replaced by ``add``. Non-string
        entries (for example ``NaN`` or ``None`` standing in for missing
        text) are passed through unchanged.
    """
    # str.replace already returns the text unchanged when `remove` is absent,
    # so no membership test is needed. The isinstance guard avoids the
    # TypeError that `remove in value` raised for non-string entries.
    return [
        entry.replace(remove, add) if isinstance(entry, str) else entry
        for entry in df
    ]

In [11]:
# Disabled cleaning of the training text (would strip the Naira sign and parentheses).
# These lines are commented out, so they have no effect when the notebook runs.
#train['NewsText'] = data_remove(train['NewsText'],remove = '₦', add = '' )
#train['NewsText'] = data_remove(train['NewsText'],remove = ')', add = '' )
#train['NewsText'] = data_remove(train['NewsText'],remove = '(', add = '' )

In [12]:
# Disabled cleaning of the test text (would strip the Naira sign and parentheses).
# These lines are commented out, so they have no effect when the notebook runs.
#test['NewsText'] = data_remove(test['NewsText'],remove = '₦', add = '' )
#test['NewsText'] = data_remove(test['NewsText'],remove = ')', add = '' )
#test['NewsText'] = data_remove(test['NewsText'],remove = '(', add = '' )

In [13]:
# Features: the NewsText column as loaded. Target: the label column remapped by transform(..., 'encode').
X = train['NewsText']
y = transform(train['label'].values, details = 'encode')

# Modelling

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [15]:
# Shared modelling objects.
# The original note says the classifier parameters were chosen after parameter tuning;
# presumably that refers to rfc_1..rfc_3 defined in a later cell, since `rfc` here uses defaults.
cv = CountVectorizer()  # text vectoriser (despite the name, not a cross-validation splitter)
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)  # 5 splits x 3 repeats
# NOTE(review): str_kf and rfc are not referenced by the later cells visible in this notebook.
str_kf = StratifiedKFold(n_splits=5) 
rfc = RandomForestClassifier()

In [16]:
# Stratified split: 20% of the labelled data is held out as a local test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, test_size=0.2, random_state=42)

In [17]:
# Fit the vectoriser on the training split only, then reuse it for the hold-out split.
# NOTE: this rebinds X_train/X_test from text to dense count arrays, so the cell is not
# re-runnable on its own; re-run the train_test_split cell first.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [18]:
# Three random-forest configurations; each is passed to model_building further down.
rfc_1 = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=6,
    max_features='sqrt',
    min_samples_split=5,
    random_state=42,
)

rfc_2 = RandomForestClassifier(
    n_estimators=600,
    max_depth=8,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=10,
    random_state=42,
)

rfc_3 = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42,
)

In [19]:
def model_building(model):
    """Cross-validate ``model`` and return its averaged test-set probabilities.

    The function reads notebook-level globals: ``X_train``/``y_train`` (the
    vectorised training split), ``X_test``/``y_test`` (the hold-out split),
    ``skf`` (the cross-validation splitter), ``cv`` (the fitted vectoriser)
    and ``test`` (the table whose ``NewsText`` column is scored).

    For every split produced by ``skf`` the model is refitted on the training
    part of the split; its accuracy on the training part, the validation part
    and the hold-out split is printed; and its ``predict_proba`` output for
    the vectorised ``test['NewsText']`` is collected.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns:
        np.ndarray: The collected ``predict_proba`` outputs averaged over all
        splits (element-wise).
    """
    train_score = []
    cv_score = []
    test_pred = []
    sub_pred = []

    X = pd.DataFrame(X_train)
    # Build the label Series once instead of on every split.
    yy = pd.Series(y_train)
    Xtest = pd.DataFrame(X_test)
    yytest = y_test

    ttest = pd.DataFrame(cv.transform(test['NewsText']).toarray())

    for train_index, val_index in skf.split(X, yy):
        X_trainx, X_val = X.iloc[train_index], X.iloc[val_index]
        y_trainx, y_val = yy.iloc[train_index], yy.iloc[val_index]

        model.fit(X_trainx, y_trainx)

        # Each accuracy is computed once and reused for printing and bookkeeping.
        train_acc = accuracy_score(y_trainx, model.predict(X_trainx))
        print("Train Accuracy: ", train_acc)
        train_score.append(train_acc)

        val_acc = accuracy_score(y_val, model.predict(X_val))
        print("Validation Accuracy: ", val_acc)
        cv_score.append(val_acc)

        test_acc = accuracy_score(yytest, model.predict(Xtest))
        print("Test Accuracy Score : ", test_acc)
        print("______________\n")
        test_pred.append(test_acc)

        tested = model.predict_proba(ttest)
        print("Test has been Scored")
        print("______________\n")
        sub_pred.append(tested)

    print(f'Training Mean Accuracy Score {np.mean(train_score)}')
    print(f'Validation Mean Accuracy Score {np.mean(cv_score)}')
    print(f'Test Mean AccuracyScore {np.mean(test_pred)}')

    # Divide by the number of splits actually run rather than a hard-coded 15,
    # so the average stays correct if the splitter configuration changes.
    return np.sum(sub_pred, axis=0) / len(sub_pred)

In [20]:
# Cross-validate each configuration and keep its averaged test-set probabilities.
value_1 = model_building(model = rfc_1)
value_2 = model_building(model = rfc_2)
value_3 = model_building(model = rfc_3)

Train Accuracy:  0.8562091503267973
Validation Accuracy:  0.5897435897435898
Test Accuracy Score :  0.4791666666666667
______________

Test has been Scored
______________

Train Accuracy:  0.8562091503267973
Validation Accuracy:  0.6410256410256411
Test Accuracy Score :  0.5
______________

Test has been Scored
______________

Train Accuracy:  0.8246753246753247
Validation Accuracy:  0.3684210526315789
Test Accuracy Score :  0.5208333333333334
______________

Test has been Scored
______________

Train Accuracy:  0.8246753246753247
Validation Accuracy:  0.5263157894736842
Test Accuracy Score :  0.4791666666666667
______________

Test has been Scored
______________

Train Accuracy:  0.8506493506493507
Validation Accuracy:  0.6842105263157895
Test Accuracy Score :  0.5
______________

Test has been Scored
______________

Train Accuracy:  0.8169934640522876
Validation Accuracy:  0.5384615384615384
Test Accuracy Score :  0.5833333333333334
______________

Test has been Scored
______________

In [21]:
# Weighted blend of the three models' outputs (weights 0.15 + 0.35 + 0.5 = 1.0).
new_val = value_1 *0.15 + value_2*0.35 + value_3*0.5

In [22]:
# Take the index of the largest blended value in each row.
# NOTE: this rebinds new_val from the blended 2-D array to 1-D indices, so re-running
# this cell alone will not work; re-run the blend cell first.
new_val = np.argmax(new_val, axis = 1)

In [23]:
# Map the class indices 0/1/2 back to the -1/0/1 labels.
prediction = transform(new_val, details = 'decode')

In [24]:
# Count how many rows were assigned to each predicted label.
pd.Series(prediction).value_counts()

 1    34
-1    24
 0     2
dtype: int64

In [25]:
# Put the predictions into the submission table and write it to CSV without the index column.
sub['label'] = prediction
sub.to_csv('1st_place_solution.csv', index = False)

In [26]:
# Wall-clock end of the run, paired with `start` recorded at the top of the notebook.
end = time.time()

In [27]:
# Elapsed time in seconds between the two time.time() readings.
print('Time to Run the whole Notebook:', end-start)

Time to Run the whole Notebook: 57.23318791389465
