## IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import string
import re
import fasttext as ft
from sklearn.model_selection import train_test_split
from slugify import Slugify
from pprint import pprint
import joblib

In [None]:
df = pd.read_csv("Topic_Labels.csv", dtype=str, low_memory=False, encoding="utf-8")
df['text'] = df.text.astype(str)
df.head()

In [None]:
df.groupby(['topic_Name','topic']).count()

In [None]:
custom_slugify = Slugify(to_lower=True)
df['topic_Name'] = df['topic_Name'].map(custom_slugify)
df.groupby(['topic']).count()

## Declare the ordered topic names

In [None]:
target_names_list = ['Health', 'Nutrition', 'Education', 'HIV Aids', 'Violence Against Children (VAC)', 'WASH', 'Mestrual Hygiene', 'Others','Corona','U-Report']

## Pre-Processing steps (Comment as required)

In [None]:
def clean_for_prediction(list_text):
    """Normalise raw messages before they are passed to a classifier.

    Every message has punctuation and non ASCII-alphanumeric characters
    replaced by spaces, is lower-cased, has known typos and slang replaced
    using the lookup tables, and finally has stop-words removed.

    The lookup tables are read on every call from ``../Typos.csv``
    (columns ``Typo``/``Word``), ``../Slangs.csv`` (columns
    ``Slang``/``Meaning``) and ``../Stopwords.csv`` (column ``StopWords``).

    Args:
        list_text: iterable of strings to clean.

    Returns:
        A list with one cleaned string per input, in input order.
    """
    typos_df = pd.read_csv("../Typos.csv", dtype=str, low_memory=False, encoding="utf-8")
    slangs_df = pd.read_csv("../Slangs.csv", dtype=str, low_memory=False, encoding="utf-8")
    stopwords_df = pd.read_csv("../Stopwords.csv", dtype=str, low_memory=False, encoding="utf-8")
    # A set gives constant-time membership tests in the stop-word filter.
    stopwords = set(stopwords_df['StopWords'].dropna())

    def build_rules(table, key_col, value_col):
        """Compile one whole-token replacement rule per complete table row."""
        rules = []
        # Rows with a missing key or replacement cannot be applied; with
        # dtype=str an empty CSV cell is read as NaN, so they are skipped.
        pairs = table[[key_col, value_col]].dropna()
        for key, value in pairs.itertuples(index=False, name=None):
            # Lookarounds require a space on both sides without consuming it,
            # so two adjacent occurrences ("km km") are both replaced.
            rules.append((re.compile('(?<= )' + re.escape(key) + '(?= )'), value))
        return rules

    # Typos are applied before slang, each table in file order.
    rules = build_rules(typos_df, 'Typo', 'Word') + build_rules(slangs_df, 'Slang', 'Meaning')

    new_list = []
    for x in list_text:
        x = re.sub(r'[^\w\s]', ' ', x)       # Replace punctuation such as /?!. with spaces
        x = x.strip()                        # Remove leading and trailing spaces
        x = re.sub(' +', ' ', x)             # Collapse runs of spaces
        x = re.sub('[^A-Za-z0-9]+', ' ', x)  # Replace remaining special characters
        x = x.lower()                        # Convert to lower case
        x = ' ' + x + ' '                    # Pad so the first and last token are space-delimited too
        for pattern, replacement in rules:
            # A callable replacement keeps backslashes in the table literal.
            x = pattern.sub(lambda _match, text=replacement: text, x)
        x = ' '.join(word for word in x.split() if word not in stopwords)  # Remove stop-words
        new_list.append(x)
    return new_list

In [None]:
X = ["hedhi    pedi?virusi! paka sanitaiza/mikono udumavu  /  wa MTOTO'Elim  km 19   ÃƒÂƒÃ‚ÂƒÃƒÂ‚Ã‚Â£ÃƒÂƒÃ‚Â‚ÃƒÂ‚Ã‚Âº xul Kituo ya afya   kinatoa huduma mbaya wajawazito  kwa na mimi "]
docs_new = ['    hedhi    pedi?', 'virusi!','paka sanitaiza/mikono ','udumavu  /  wa MTOTO','Elim  km 19   ÃƒÂƒÃ‚ÂƒÃƒÂ‚Ã‚Â£ÃƒÂƒÃ‚Â‚ÃƒÂ‚Ã‚Âº xul', '   Kituo ya afya   kinatoa huduma mbaya wajawazito  kwa na mimi  ']
docs_new = clean_for_prediction(docs_new)
print(docs_new)

## Split Dataset into TRAIN and TEST sets

In [None]:
def our_train_test_split(df, test_size=.25):
    """Split ``df`` into train and test frames, class by class.

    Each distinct value of ``df.topic`` is split separately with a fixed
    random seed, so every topic contributes roughly ``test_size`` of its rows
    to the test frame.

    Args:
        df: DataFrame with at least the columns ``text`` and ``topic``.
        test_size: passed through to ``train_test_split`` for every class.

    Returns:
        ``(train, test)`` DataFrames with the columns ``text`` and ``topic``;
        rows keep their original index labels.
    """
    train_parts = []
    test_parts = []

    for c in df.topic.unique():
        class_rows = df.loc[df.topic == c, ['text', 'topic']]
        part_train, part_test = train_test_split(
            class_rows, test_size=test_size, random_state=42, shuffle=True)
        train_parts.append(part_train)
        test_parts.append(part_test)

    if not train_parts:
        # No classes at all: return empty frames with the expected columns.
        return (pd.DataFrame(columns=['text', 'topic']),
                pd.DataFrame(columns=['text', 'topic']))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(train_parts), pd.concat(test_parts)


In [None]:
train, test = our_train_test_split(df)

# 2. Stochastic Gradient Descent Classifier

## 2.1 SGDClassifier using step-by-step and TF-IDF-Transformer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
count_vect = CountVectorizer(ngram_range=(1,1))

# Learn the vocabulary from the training split only, so that no information
# from the held-out test rows leaks into the features.
count_vect.fit(train['text'])
X_train_counts = count_vect.transform(train['text'])

X_train_counts.shape

In [None]:
count_vect.vocabulary_.get(u'elimu')

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)

In [None]:
X_test_counts = count_vect.transform(test['text'])
print(X_test_counts.shape)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)

In [None]:
sgd_clf.fit(X_train_tfidf,  train['topic'])
predicted = sgd_clf.predict(X_test_tfidf)
sc = np.mean(predicted == test['topic'])
sc

In [None]:
from sklearn import metrics
# NOTE(review): target_names are matched positionally against the sorted
# labels found in the data; `topic` is loaded as strings, so confirm that the
# order of target_names_list matches that (lexicographic) label order.
sgd_report = metrics.classification_report(
    test['topic'],
    predicted,
    target_names=target_names_list,
)
print(sgd_report)

In [None]:
docs_new = ['hedhi pedi', 'elimu ya afya haiwafikii walengwa','paka sanitaiza mikono ','udumavu wa mtoto','elimu inayotolewa', 'Kituo afya kinatoa huduma mbaya wajawazito','watoto wanatumikishwa kingono']
docs_new = clean_for_prediction(docs_new)
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = sgd_clf.predict(X_new_tfidf)
print(predicted)

## 2.1 SGDClassifier using PIPELINE and TF-IDF Transformer

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

sgd_tfidf_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='modified_huber', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

In [None]:
# TRAINING ALGORITHM- Fit Pipeline
sgd_tfidf_model = sgd_tfidf_pipeline.fit(train['text'], train['topic'])
sgd_tfidf_model

In [None]:
sc = sgd_tfidf_model.score(test['text'], test['topic'] )
sc

In [None]:
# Save to file in the current working directory
joblib_file = "Models/sgd_tfidf_model.pkl"
joblib.dump(sgd_tfidf_model, joblib_file)

## 2.2 SGDClassifier using PIPELINE and TF-Transformer

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

sgd_tf_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tf', TfidfTransformer(use_idf=False)),
     ('clf', SGDClassifier(loss='modified_huber', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

In [None]:
# TRAINING ALGORITHM- Fit Pipeline
sgd_tf_model = sgd_tf_pipeline.fit(train['text'], train['topic'])
sgd_tf_model

In [None]:
sc = sgd_tf_model.score(test['text'], test['topic'] )
sc

In [None]:
# Save to file in the current working directory
joblib_file = "Models/sgd_tf_model.pkl"
joblib.dump(sgd_tf_model, joblib_file)

In [None]:
from sklearn import metrics
predicted_test = sgd_tf_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))

In [None]:
new_text = ['hedhi pedi', 'virusi','paka sanitaiza mikono ','udumavu wa mtoto','elimu inayotolewa', 'Kituo afya kinatoa huduma mbaya wajawazito']
new_text_cleaned = clean_for_prediction(new_text)
predicted = sgd_tf_model.predict(new_text_cleaned)
print(predicted)

# 3. Support Vector Machines Classifier(SKLEARN)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline


svm_pipeline = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,1))),
     ('tfidf', TfidfTransformer()),
     ('clf', SVC(kernel = 'sigmoid', random_state=0, gamma='scale', C=1.2, probability=True)), #or c=1.3
 ])

In [None]:
svm_model = svm_pipeline.fit(train['text'], train['topic'])
svm_model

In [None]:
sc = svm_model.score(test['text'], test['topic'])
sc

In [None]:
# Save to file in the current working directory
joblib_file = "Models/SVM_model.pkl"
joblib.dump(svm_model, joblib_file)

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm_pipeline,df['text'], df['topic'], cv=5, scoring='f1_weighted')
scores

In [None]:
# `scores` comes from cross_val_score(..., scoring='f1_weighted'), so the
# summary is labelled as weighted F1 rather than accuracy.
print("Weighted F1: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std() * 2*100))

In [None]:
from sklearn import metrics
predicted_test = svm_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))

In [None]:
new_text = ['hedhi pedi Mwanamke huwa anapata siku zake na damu nyingi hutoka na kumchafua asipotumia pedi wakati wa hedhi']
new_text_cleaned = clean_for_prediction(new_text)
predicted = svm_model.predict(new_text_cleaned)
print(predicted)

In [None]:
def my_pipeline(gamma, C):
    """Fit an RBF-kernel SVC text pipeline on the global ``train`` split.

    Args:
        gamma: kernel coefficient forwarded to ``SVC``.
        C: regularisation parameter forwarded to ``SVC``.

    Returns:
        The fitted ``Pipeline`` (count vectoriser -> TF-IDF -> SVC).
    """
    classifier = SVC(kernel='rbf', random_state=0, gamma=gamma, C=C, probability=True)
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    rbf_pipeline = Pipeline(steps)
    # Pipeline.fit returns the pipeline itself.
    rbf_pipeline.fit(train['text'], train['topic'])
    return rbf_pipeline

gamma_range = np.linspace(0.1, 1, 10)
C_range = np.arange(10, 110, 10)
print(gamma_range)
print(C_range)

In [None]:
# Exhaustive search over gamma_range x C_range, remembering the best pair.
# NOTE(review): candidates are scored on the test split, so max_score is an
# optimistic estimate of generalisation performance.
g = 0
c = 0
max_score = 0
for gamma in gamma_range:
    for C in C_range:
        # Use a separate name so the fitted `svm_model` from the earlier cell
        # is not overwritten by whichever grid point happens to run last.
        candidate_model = my_pipeline(gamma, C)
        score = candidate_model.score(test['text'], test['topic'])
        print("For gamma = ",gamma, ", C = ", C, ": Score = ", score)
        if score > max_score:
            max_score = score
            c = C
            g = gamma

In [None]:
# Report the best pair stored by the search loop. `c` holds the best C;
# the loop variable `C` only holds the last value that was tried.
print("Gamma: ", g)
print("C: ", c)
print("Best Score: ", max_score)

In [None]:
# Random Search with Cross Validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV

C = [round(x,1) for x in np.linspace(0.1, 1.0, 10)]
gamma = ['scale', 'auto']
kernel = ['rbf','poly','linear','sigmoid']
class_weight= ['balanced',None]

# Create the random grid
random_grid = {
               'C': C,
               'gamma': gamma,
               'kernel': kernel,
    'class_weight': class_weight
              }

pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
sv = SVC(random_state = 0)
# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores.
# Scored with accuracy: 'neg_mean_absolute_error' is a regression metric and
# is not meaningful for class labels (the topic column is loaded as strings).
sv_random = RandomizedSearchCV(estimator=sv, param_distributions=random_grid,
                              n_iter = 100, scoring='accuracy',
                              cv = 5, verbose=2, random_state=0, n_jobs=-1,
                              return_train_score=True)
# Fit the random search model
sv_random.fit(X_train_tfidf, train['topic'])

In [None]:
sv_random.best_params_

In [None]:
# Evaluation Function
def evaluate(model, test_features, test_labels):
    """Score ``model`` on held-out data.

    Args:
        model: fitted estimator exposing ``score(features, labels)``.
        test_features: features handed to ``model.score``.
        test_labels: labels handed to ``model.score``.

    Returns:
        Whatever ``model.score`` returns for the given data.
    """
    result = model.score(test_features, test_labels)
    return result

In [None]:
# Evaluate the Default Model
base_model = SVC()
base_model.fit(X_train_tfidf, train['topic'])
base_accuracy = evaluate(base_model, X_test_tfidf, test['topic'])
base_accuracy

In [None]:
# Evaluate the Best Random Search Model
# The SVC search object in this section is `sv_random`; `rf_random` belongs
# to the random-forest section further down and is not defined yet here.
best_random = sv_random.best_estimator_
random_accuracy = evaluate(best_random, X_test_tfidf, test['topic'])
random_accuracy

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
               'C':  [1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
               'gamma': ['scale', 'auto'],
               'kernel': ['rbf','poly','sigmoid'],
               'class_weight': ['balanced',None]
            }

# Create a base model
svm = SVC(random_state = 0)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svm, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, return_train_score=True)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train_tfidf, train['topic'])

In [None]:
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_tfidf, test['topic'])
grid_accuracy

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# 4. Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
multiNB_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB(alpha=1.0, fit_prior=True)), #GaussianNB
 ])

In [None]:
multiNB_model = multiNB_pipeline.fit(train['text'], train['topic'])
multiNB_model

In [None]:
sc = multiNB_model.score(test['text'], test['topic'])
sc

In [None]:
# Save to file under Models/Classification, tagged with date and test score
from datetime import datetime

# `dt` is not defined anywhere earlier in this notebook; use today's date.
# NOTE(review): confirm the intended timestamp format for model file names.
dt = datetime.now().strftime("%Y%m%d")
joblib_file = "Models/Classification/multiNB_model_"+ dt + "_score_"+ str(round(sc*100,2))+".pkl"
joblib.dump(multiNB_model, joblib_file)

In [None]:
from sklearn import metrics
predicted_test = multiNB_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))

In [None]:
# Spot-check the Naive Bayes model on a few hand-written messages.
new_text = ['hedhi pedi wawekwe karantini',
            'kupata maji safi shida',
            'maambukizo kutoka kwa mama',
            'udumavu wa elimu inayotolewa izingatie', 
            'Kituo afya kinatoa huduma mbaya wajawazito']
new_text_cleaned = clean_for_prediction(new_text)
# Predict with the model trained in this section (previously svm_model).
predicted = multiNB_model.predict(new_text_cleaned)
print(predicted)

# 5. Decision Tree

In [None]:
from sklearn import tree
from sklearn.pipeline import Pipeline

decisionTree_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', tree.DecisionTreeClassifier()),
 ])

In [None]:
decisionTree_model = decisionTree_pipeline.fit(train['text'], train['topic'])
decisionTree_model

In [None]:
sc = decisionTree_model.score(test['text'], test['topic'])
sc

In [None]:
# Save to file in the current working directory
joblib_file = "Models/decisionTree_model.pkl"
joblib.dump(decisionTree_model, joblib_file)

In [None]:
from sklearn import metrics
predicted_test = decisionTree_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))

In [None]:
# Spot-check the decision-tree model on a few hand-written messages.
new_text = ['hedhi pedi', 
            'hatua kujikinga homa mapafu kunawa mikono',
            'udumavu wa mtoto','elimu inayotolewa izingatie', 
            'Kituo afya kinatoa huduma mbaya wajawazito',
            'Magonjwa yatokanayo na ngono isiyo salama na maambukizo mengine yanaweza pia kuongeza uwezekano wa maambukizo']
new_text_cleaned = clean_for_prediction(new_text)
# Predict with the model trained in this section (previously svm_model).
predicted = decisionTree_model.predict(new_text_cleaned)
print(predicted)

# 6. Forests of randomized trees

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Count vectoriser -> TF-IDF -> random forest with the tuned hyper-parameters.
randomForest_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
     # is no longer accepted by current scikit-learn releases.
     ('clf', RandomForestClassifier(
         random_state=0,
         n_jobs=-1,
         criterion='gini',
         max_depth=None,
         max_features='sqrt',
         min_samples_leaf=1,
         min_samples_split=5,
         n_estimators=1000,
     )),
 ])

In [None]:
randomForest_model = randomForest_pipeline.fit(train['text'], train['topic'])
randomForest_model

In [None]:
sc = randomForest_model.score(test['text'], test['topic'])
sc

In [None]:
# Save to file in the current working directory
joblib_file = "Models/RandomForest_model.pkl.z"
joblib.dump(randomForest_model, joblib_file, compress=9)

In [None]:
from sklearn import metrics
predicted_test = randomForest_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))

In [None]:
new_text = ['hedhi pedi', 
            'hatua kujikinga homa mapafu kunawa mikono',
            'udumavu wa mtoto','elimu inayotolewa izingatie', 
            'Kituo afya kinatoa huduma mbaya wajawazito',
            'Magonjwa yatokanayo na ngono isiyo salama na maambukizo mengine yanaweza pia kuongeza uwezekano wa maambukizo']
new_text_cleaned = clean_for_prediction(new_text)
predicted = randomForest_model.predict(new_text_cleaned)
print(predicted)

In [None]:
# Random Search with Cross Validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest (starts at 50: a forest needs at least one tree)
n_estimators = [int(x) for x in np.arange(50, 1500, 50)]
# Number of features to consider at every split
# ('auto' was an alias of 'sqrt' for classifiers and is no longer accepted)
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Split quality measure
criterion= ['gini', 'entropy']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

pprint(random_grid)

In [None]:
# Base forest to tune; the search samples 100 candidates from random_grid
# and evaluates each with 5-fold cross validation.
rf = RandomForestClassifier(random_state = 0, n_jobs=-1)
# Scored with accuracy: 'neg_mean_absolute_error' is a regression metric and
# is not meaningful for class labels (the topic column is loaded as strings).
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='accuracy',
                              cv = 5, verbose=2, random_state=0, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train_tfidf, train['topic'])

In [None]:
rf_random.best_params_

In [None]:
# Evaluation Function
def evaluate(model, test_features, test_labels):
    """Score ``model`` on held-out data.

    Args:
        model: fitted estimator exposing ``score(features, labels)``.
        test_features: features handed to ``model.score``.
        test_labels: labels handed to ``model.score``.

    Returns:
        Whatever ``model.score`` returns for the given data.
    """
    result = model.score(test_features, test_labels)
    return result

In [None]:
# Evaluate the Default Model
base_model = RandomForestClassifier()
base_model.fit(X_train_tfidf, train['topic'])
base_accuracy = evaluate(base_model, X_test_tfidf, test['topic'])
base_accuracy

In [None]:
# Evaluate the Best Random Search Model
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test_tfidf, test['topic'])
random_accuracy

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

### 6.2 Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
# {'criterion': 'gini',
#  'max_depth': None,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 5,
#  'n_estimators': 1000}

# Narrow grid around the best random-search result.
# max_features uses 'sqrt': it is what 'auto' meant for classifiers, and
# 'auto' is no longer accepted by current scikit-learn releases.
param_grid = {
     'n_estimators':[985, 990, 1000],
     'min_samples_split': [2,4, 5, 6],
     'min_samples_leaf': [1],
     'max_features': ['sqrt'],
     'max_depth': [None],
     'criterion': ['gini'] # 'entropy'
}

# Create a base model
rf = RandomForestClassifier(random_state = 0, n_jobs=-1)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, return_train_score=True)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train_tfidf, train['topic']);

In [None]:
grid_search.best_params_

In [None]:
# Evaluate the Best Model from Grid Search
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_tfidf, test['topic'])
grid_accuracy

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# 7. Nearest Neighbors Classification

In [None]:
k = len(target_names_list)+1

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
best_k = k+7
knn_pipeline = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', KNeighborsClassifier(best_k, weights='distance')),
 ])

In [None]:
knn_model = knn_pipeline.fit(train['text'], train['topic'])
knn_model

In [None]:
sc = knn_model.score(test['text'], test['topic'])
sc

In [None]:
# Save to file under Models/Classification, tagged with date and test score
from datetime import datetime

# `dt` is not defined anywhere earlier in this notebook; use today's date.
# NOTE(review): confirm the intended timestamp format for model file names.
dt = datetime.now().strftime("%Y%m%d")
joblib_file = "Models/Classification/knn_model_"+ dt + "_score_"+ str(round(sc*100,2))+".pkl"
joblib.dump(knn_model, joblib_file)

In [None]:
from sklearn import metrics
predicted_test = knn_model.predict(test['text'])
print(metrics.classification_report(test['topic'], predicted_test,target_names=target_names_list))