## Project Objective: AUTOMATIC TICKET ASSIGNMENT

In [2]:
# !pip install openpyxl
# !pip install tabulate
# !pip install langdetect
# !pip install flake8 pycodestyle_magic
# !pip install textblob
# !pip install pycountry
# !pip install spacy_cld
# !pip install fasttext
# !pip install fastlangid
# !pip install plotly
# !pip install chart_studio
# %reset
# !pip install wordcloud
#!pip install wordcloud
# !pip install lightgbm
# !pip install -U gensim
# !pip install yellowbrick

### Process Flow
1. Import data.
2. Explore the data and check for any inconsistency.
3. Remove redundant data from Short description and Description.
4. Detect the language of each description.
5. EDA and visualization.
6. Drop duplicates based on Short description, Description and Assignment group.
7. Tokenize the data and remove stop words.
8. Create a corpus for Description and for the combined description.



In [3]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [4]:
##################################################
# File: Automatic Ticket Assignment
# Author: Manasi Hiremath
# Date: 31-Aug-21
# Purpose: NLP based Ticket Assignment
# Input : input_data.xlsx
# Output:Predictive model to automate ticket assignment
# Total Execution Time:495.24 second(s)
##################################################

In [5]:
# Check PEP8 Standards
# %reload_ext pycodestyle_magic
# %pycodestyle_on

In [6]:
# ===================== Step 1: Import library =================
import gc
import logging
import os
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import openpyxl
import pandas as pd
import plotly.express as px
import pytz
import seaborn as sns
from fastlangid.langid import LID
from langdetect import detect
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob
nltk.download('stopwords')
nltk.download('punkt')
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
#===================== Step 2: Logging =================
# logging.basicConfig(filename='NLP_Capstone_Project.log', level=logging.DEBUG,
#                     format='%(process)d| %(asctime)s | %(name)s | %(levelname)s | %(message)s')
# logging.warning('This will get logged to a file')
# logging.info('This will get logged to a file')

In [8]:
# ===================== Step 3: Processing =================
def func_read_data_n_explore(path):
    '''
    Load an excel file into a pandas dataframe and print a short exploration summary.
    input parameter: path - location of the input excel file (read with the openpyxl engine)
    output parameter: pandas dataframe with the file contents. The summary (shape,
                      columns holding nulls, null counts per column and the first
                      rows where each such column is null) is printed, not returned.
    '''
    frame = pd.read_excel(path, engine='openpyxl')
    # Columns with at least one null value, kept in the original column order
    cols_with_nulls = [col for col in frame.columns if len(frame[frame[col].isnull()]) >= 1]

    summary_lines = (
        'Data Summary',
        '1.Shape of Data:{}'.format(frame.shape),
        '2.Display if column has null values:{}'.format(cols_with_nulls),
        '3.Display count of missing data :{}'.format(frame.isnull().sum()),
    )
    # Every summary line is followed by one blank line
    for line in summary_lines:
        print(line)
        print('')

    for position, col in enumerate(cols_with_nulls):
        print("4.{} Sample of missing column:{}".format(position, col))
        print('****************')
        rows_missing_col = frame[frame[col].isnull()]
        print(rows_missing_col.head())
        print('')
    return frame


def func_remov_special_char(text):
    '''
    Blank out everything that is not an ASCII letter.
    input: text. i.e. one dataframe cell value
    output: text with every non-letter character replaced by a single space;
            an empty string when the value cannot be processed (e.g. NaN / None)
    '''
    non_letter = r"[^a-zA-Z]"
    try:
        cleaned = re.sub(non_letter, " ", text)
    except Exception:
        # non-string cells (missing values) are mapped to an empty text
        cleaned = ''
    return cleaned

def func_remov_disclaimer(text):
    '''
    Drop the e-mail disclaimer sentence from a text.
    input: text. i.e. one dataframe cell value
    output: text without the disclaimer sentence; an empty string when the
            value cannot be processed (e.g. NaN / None)
    '''
    disclaimer = r"select the following link to view the disclaimer in an alternate language. "
    try:
        cleaned = re.sub(disclaimer, "", text)
    except Exception:
        # non-string cells (missing values) are mapped to an empty text
        cleaned = ''
    return cleaned


def func_clean_data(temp_02):
    '''
    Clean the ticket text columns and strip whitespace from the other text columns.
    'Short description' and 'Description' are each cleaned from their own source
    column with the same steps:
      1. drop the e-mail disclaimer sentence (func_remov_disclaimer)
      2. drop excel line-break artefacts, 'received from:' and e-mail addresses
      3. replace every non-letter by a space (func_remov_special_char)
      4. drop words that are part of any caller name
      5. trim and collapse repeated whitespace
    input parameter: temp_02 - dataframe with the columns 'Short description',
                     'Description' and 'Caller'
    output parameter: cleaned copy of the dataframe (the input frame is not modified)
    '''
    cleaned = temp_02.copy()
    text_columns = ['Short description', 'Description']
    remove_char_list = ['_x000D_\n', '_x000D_\n_x000D_\n', 'received from:', r"\S*@\S*\s", '\n']

    # Every whitespace separated part of a caller name is treated as a name token.
    # A set gives one pass over the text instead of one pass per caller row.
    caller_tokens = set()
    for caller in cleaned['Caller'].dropna().astype(str):
        caller_tokens.update(caller.split())

    def _drop_names_and_squeeze(text):
        # split() trims the ends and collapses runs of whitespace as a side effect
        return ' '.join(word for word in text.split() if word not in caller_tokens)

    for column in text_columns:
        cleaned[column] = (cleaned[column]
                           .apply(func_remov_disclaimer)
                           .replace(remove_char_list, '', regex=True)
                           .apply(func_remov_special_char)
                           .apply(_drop_names_and_squeeze))

    # strip white space from the remaining text columns
    for column in cleaned.columns.difference(text_columns):
        values = cleaned[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            cleaned[column] = values.str.strip()
    return cleaned


# Function to detect text language
def func_detect_text_language(text):
    '''
    Detect the language of a text with langdetect's detect().
    input parameter: text
    Output value: language code returned by detect(), or 'unknown' when
                  detection raises for this value
    '''
    try:
        language = detect(text)
    except Exception:
        # detection failed for this value; report a placeholder instead
        return 'unknown'
    else:
        return language


def _vis01_share_table(frame, group_cols, share_col):
    '''Count rows per group, sort descending and add a percentage-of-total column.'''
    table = (frame.groupby(group_cols)[group_cols[0]]
                  .count()
                  .to_frame('Total_Count')
                  .reset_index()
                  .sort_values(by='Total_Count', axis=0, ascending=False))
    table[share_col] = round((table['Total_Count'] / table['Total_Count'].sum()) * 100, 2)
    return table


def _vis01_length_report(texts, label, title):
    '''Print character-length statistics of the texts and show a box plot of the lengths.'''
    lengths = [len(x) for x in texts]
    print('Sample Length of {}:'.format(label), lengths[0:10])
    print('Maximum {} length:'.format(label), np.max(lengths))
    # len() of a string counts characters, so the unit reported is characters
    print("Mean %.2f characters (%f)" % (np.mean(lengths), np.std(lengths)))
    fig = px.box(y=lengths, title=title)
    fig.show()


def func_vis01_target_analysis(temp05):
    '''
    This Function creates visualization.
    input: processed input data with the columns 'Assignment group',
           'Language_name', 'Short description' and 'Description'
    output: None. The following is shown / printed:
            - bar chart of ticket share per assignment group
            - table of assignment groups with fewer than 2 tickets
            - bar chart of ticket share per language
            - bar chart of ticket share per assignment group, coloured by language
            - length statistics and box plot for the short description, the
              description and both joined by a space
    The input dataframe is only read; the combined text is built locally.
    '''
    # Assignment Column analysis
    group_share = _vis01_share_table(temp05, ['Assignment group'], 'Ticket_Contribution')
    fig = px.bar(group_share, x='Assignment group', y='Ticket_Contribution',
                 title="Ticket Contribution by Assignment Group",
                 hover_data=['Assignment group', 'Total_Count', 'Ticket_Contribution'])
    fig.show()

    # Assignment groups represented by a single ticket
    prod_list_with_small_data = group_share[group_share['Total_Count'] < 2][['Assignment group', 'Total_Count']]
    print(prod_list_with_small_data)

    # Language analysis
    language_share = _vis01_share_table(temp05, ['Language_name'], 'Language_used')
    fig = px.bar(language_share, x='Language_name', y='Language_used',
                 title="Distribution of Languages used by caller",
                 hover_data=['Language_name', 'Total_Count', 'Language_used'])
    fig.show()

    # Languages contribution within assignment group
    group_language_share = _vis01_share_table(temp05, ['Assignment group', 'Language_name'],
                                              'Ticket_Contribution')
    fig = px.bar(group_language_share, x='Assignment group', y='Ticket_Contribution',
                 title="Languages used in Assignment Group",
                 hover_data=['Assignment group', 'Total_Count', 'Ticket_Contribution'],
                 color='Language_name')
    fig.show()

    # Length of the short description, the description and both combined
    _vis01_length_report(temp05['Short description'], 'ticket Short Description',
                         "Length Distribution of Short description of ticket")
    _vis01_length_report(temp05['Description'], 'ticket Description',
                         "Length Distribution of ticket Description")
    combined_description = temp05['Short description'].astype(str) + ' ' + temp05['Description'].astype(str)
    _vis01_length_report(combined_description, 'combined Description',
                         "Length Distribution of Short and long description")

    return None


# English stop words plus every single punctuation character
stop_words=nltk.corpus.stopwords.words('english')+ list(string.punctuation)
def func_doc_preprocess(doc):
    '''
    This function tokenizes text data.
    Replaces every character that is not an ASCII letter or whitespace by a
    space, lower-cases the text, tokenizes it with nltk.word_tokenize and
    removes stop words and purely numeric tokens.
    input parameter: doc: text
    output parameter: the remaining tokens joined by single spaces
    '''
    # flags must be passed by keyword: the 4th positional argument of re.sub is `count`
    doc = re.sub(r'[^a-zA-Z\s]', ' ', doc, flags=re.A)
    doc = doc.lower()
    doc = doc.strip()
    ## tokenize the document
    tokens = nltk.word_tokenize(doc)
    # remove stop words
    wordList = [token for token in tokens if token not in stop_words and not token.isdigit()]
    # re-create document from filtered tokens
    doc = ' '.join(wordList)
    return doc

def func_feature_importance_by_class(data, corpus, random_state=42):
    '''
    This function prints the most important TF-IDF features for each assignment group.
    For every distinct value of data['Assignment group'] an ExtraTreesClassifier
    is fitted one-vs-rest on the TF-IDF matrix and the 10 features with the
    highest importance are printed.
    input parameter: data: clean input data with an 'Assignment group' column
                     corpus: documents, one per row of data and in the same order
                     random_state: seed for the ExtraTreesClassifier
    output: TF-IDF matrix (values rounded to 3 decimals) as a dataframe with one
            column per feature
    '''
    # One indicator column per assignment group that is actually present
    dummy_vars = pd.get_dummies(data['Assignment group'])

    # tf_idf
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 3),
                                 max_df=.6, min_df=.01,
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(corpus).toarray()
    # get_feature_names() is missing in newer scikit-learn, get_feature_names_out() in older
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    tfidf = pd.DataFrame(np.round(X, 3), columns=feature_names)

    model = ExtraTreesClassifier(random_state=random_state)
    # Iterate over the groups found in the data rather than assuming GRP_0..GRP_n-1 exist
    for var in dummy_vars.columns:
        labels = dummy_vars[var].to_numpy()
        model.fit(tfidf, labels)
        feat_importances = pd.DataFrame({'features': feature_names,
                                         'feature_importances': model.feature_importances_})
        top_features = feat_importances.sort_values(by='feature_importances', ascending=False).head(10)
        print("\n==> ", var)
        print("  * Most Correlated terms are: %s" % (', '.join(top_features['features'])))

    return tfidf

def train_test_fit(model, X_train, X_test, y_train, y_test):
    '''
    Fit one model on the train split and score it on the test split.
    input parameter:model : classifier exposing fit / predict / score
                    X_train : Feature train data
                    X_test : Feature test data
                    y_train : Labels train data
                    y_test : Labels test data
    Output parameter (tuple, in this order):
                     y_pred: predicted labels for X_test
                     score: model.score on the test data, rounded to 3 decimals
                     recall: weighted recall (not rounded)
                     precision: weighted precision (not rounded)
                     f1_score: weighted f1, rounded to 3 decimals
                     duration: seconds spent on fitting, predicting and scoring
    '''
    started_at = time.time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = round(model.score(X_test, y_test), 3)

    # NOTE(review): the metrics are restricted to labels that were actually
    # predicted, so classes the model never predicts do not enter the weighted
    # averages -- confirm this is intended.
    metric_options = {'average': 'weighted', 'labels': np.unique(y_pred)}
    recall = metrics.recall_score(y_test, y_pred, **metric_options)
    precision = metrics.precision_score(y_test, y_pred, **metric_options)
    f1_score = round(metrics.f1_score(y_test, y_pred, **metric_options), 3)

    duration = time.time() - started_at

    return y_pred, score, recall, precision, f1_score, duration


def model_training(classfierdict, X_train, X_test, y_train, y_test, iteration_det):
    '''
    This function fits every model with train_test_fit and creates a table with the scoring metrics
    input parameter:classfierdict : dict whose 'model' entry lists the classifiers to try
                    X_train : Feature train data
                    X_test : Feature test data
                    y_train : Labels train data
                    y_test : Labels test data
                    iteration_det : label of the experiment, repeated on every row
    Output parameter:scoring_metric: dataframe with one row per model and the columns
                     iteration, Model, Model_score, Model_recall, Model_precision,
                     Model_f1_score, Elapsed, Model_name
    '''
    columns = ['iteration', 'Model', 'Model_score', 'Model_recall',
               'Model_precision', 'Model_f1_score', 'Elapsed', 'Model_name']
    rows = []
    for clf in classfierdict['model']:
        _, score, recall, precision, f1_score, duration = train_test_fit(
            clf, X_train, X_test, y_train, y_test)
        rows.append({'iteration': iteration_det,
                     'Model': clf,
                     'Model_score': score,
                     'Model_recall': recall,
                     'Model_precision': precision,
                     'Model_f1_score': f1_score,
                     'Elapsed': duration,
                     # class name of the estimator, e.g. 'LogisticRegression'
                     'Model_name': type(clf).__name__})
    # passing `columns` keeps the layout stable even when no model was supplied
    scoring_metric = pd.DataFrame(rows, columns=columns)
    return scoring_metric  # return all the metrics


def func_tfidx_metric(corpus, model_data):
    '''
    This function creates stratified train / test TF-IDF matrices.
    The data is split first (90% train, 10% test, stratified on
    'Assignment group'); the vectorizer is then fitted on the train documents
    only and applied to the test documents, so no test information enters the
    vocabulary or the idf weights.
    input parameter:corpus : documents, one per row of model_data and in the same order
                    model_data : analysis data with an 'Assignment group' column
    Output parameter:X_tr : train TF-IDF dataframe (values rounded to 3 decimals)
                     X_te : test TF-IDF dataframe with the same columns
                     y_tr : train labels
                     y_te : test labels
    '''
    documents = np.asarray(corpus, dtype=object)
    labels = model_data['Assignment group']
    # Split row positions so the matrices can be built per split afterwards
    positions = np.arange(len(documents))
    pos_tr, pos_te, y_tr, y_te = train_test_split(
        positions, labels,
        test_size=0.1, random_state=10,
        stratify=labels)

    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 3),
                                 max_df=.6, min_df=.01,
                                 sublinear_tf=True)
    train_matrix = vectorizer.fit_transform(documents[pos_tr]).toarray()
    test_matrix = vectorizer.transform(documents[pos_te]).toarray()
    # get_feature_names() is missing in newer scikit-learn, get_feature_names_out() in older
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # The index holds the row position of each document within the corpus
    X_tr = pd.DataFrame(np.round(train_matrix, 3), columns=feature_names, index=pos_tr)
    X_te = pd.DataFrame(np.round(test_matrix, 3), columns=feature_names, index=pos_te)
    return X_tr, X_te, y_tr, y_te


def func_feature_selection_pca(X_tr, X_te):
    '''
    Reduce the features with a PCA fitted on the train data only.
    The PCA is created as PCA(0.95).
    input parameter: X_tr : train features
                     X_te : test features
    Output parameter:X_tr_pca: train features projected on the fitted components
                     X_te_pca: test features projected on the same components
    '''
    reducer = PCA(0.95)
    reducer.fit(X_tr)
    projected = [reducer.transform(split) for split in (X_tr, X_te)]
    return projected[0], projected[1]


def func_feature_selection_chi(X_tr, y_tr, X_te, k=100):
    '''
    This function selects the k best features using the chi-square score
    input parameter: X_tr : train features
                     y_tr : train label
                     X_te : test features
                     k    : number of features to keep (default 100); capped at
                            the number of available features
    Output parameter:X_tr_chi2:selected train features
                     X_te_chi2:selected test features
    '''
    # Asking for more features than exist is rejected by some scikit-learn versions
    n_features = np.shape(X_tr)[1]
    ch2 = SelectKBest(chi2, k=min(k, n_features))
    X_tr_chi2 = ch2.fit_transform(X_tr, y_tr)
    X_te_chi2 = ch2.transform(X_te)
    return X_tr_chi2, X_te_chi2

In [9]:
start = time.perf_counter()
print('Step 1. Getting Input Data Information...')
temp_03 = func_read_data_n_explore("input_data.xlsx")

Step 1. Getting Input Data Information...
Data Summary

1.Shape of Data:(8500, 4)

2.Display if column has null values:['Short description', 'Description']

3.Display count of missing data :Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64

4.0 Sample of missing column:Short description
****************
     Short description                                        Description  \
2604               NaN  _x000D_\n_x000D_\nreceived from: ohdrnswl.rezu...   
3383               NaN  _x000D_\n-connected to the user system using t...   
3906               NaN  -user unable  tologin to vpn._x000D_\n-connect...   
3910               NaN  -user unable  tologin to vpn._x000D_\n-connect...   
3915               NaN  -user unable  tologin to vpn._x000D_\n-connect...   

                 Caller Assignment group  
2604  ohdrnswl rezuibdt           GRP_34  
3383  qftpazns fxpnytmk            GRP_0  
3906  awpcmsey ctdiuqwe            GRP_0  
3910 

In [10]:
print("Step 2. Data Cleaning has started...")
temp_04 = func_clean_data(temp_03)
print("Step 3. Detecting text language...")
temp_04['text_lang'] = temp_04['Description'].apply(func_detect_text_language)
# Get language name
lang = pd.read_csv('language-codes_csv.csv')
temp_05 = pd.merge(temp_04, lang, left_on='text_lang', right_on='alpha2', how='left') 
print("Step 4. Data Explorarion...")
print('')
print('Data has - {} - ticket assignment groups'.format(len(temp_03['Assignment group'].unique())))
print('')
print('Tickets are registered in - {} languages '.format(len(temp_04['text_lang'].unique()))) 
print('')
func_vis01_target_analysis(temp_05)
print('Step 4.1. NLP preprocessing')

Step 2. Data Cleaning has started...


KeyboardInterrupt: 

In [None]:
temp_05.loc[:,'combined_description']=temp_05['Short description'].astype(str)+' '+temp_05['Description'].astype(str)
print(temp_05.shape)
temp_06=temp_05.drop_duplicates(['Short description','Description','Assignment group'])
print(temp_06.shape)

In [None]:
#create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
corpus_model = temp_06['Description']
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus_model)

from sklearn.manifold import TSNE
import gc
gc.collect()
# tfIdfMatrix = tfIdfMat.todense()
labels = temp_06['Assignment group'].tolist()
tsne_results = TSNE(n_components=2,init='random',random_state=0, perplexity=40).fit_transform(X)
plt.figure(figsize=(16,10))
# palette = sns.hls_palette(21, l=.6, s=.9)
sns.scatterplot(
    x=tsne_results[:,0], y=tsne_results[:,1],
    hue=labels,
#     palette= palette,
    legend="full",
    alpha=0.3
)
plt.show()

In [None]:
# sns.scatterplot(
#     x=tsne_results[:,0], y=tsne_results[:,1],
#     hue=labels,
# #     palette= palette,
#     legend="full",
#     alpha=0.3
# )

fig = px.scatter(x=tsne_results[:,0], y=tsne_results[:,1], color=labels)
fig.show()

In [None]:
tsne_results.shape

In [None]:
# Cluster all tickets outside GRP_0 into 4 KMeans groups on their TF-IDF
# features and project the features to 2-D with t-SNE for plotting.
temp_11 = temp_06[temp_06['Assignment group'] != 'GRP_0']

# create corpus for modelling
corpus_model = temp_11['Description']
vectorizer = TfidfVectorizer(stop_words='english')
X_sub = vectorizer.fit_transform(corpus_model)

# random_state keeps the cluster ids stable between runs (later cells refer to
# the ids); the `algorithm` argument is left at the library default.
clusters = KMeans(n_clusters=4, n_init=30, random_state=0)
clusters.fit(X_sub)
x_pred_2 = clusters.predict(X_sub)

data = temp_11.copy()
data['clust_group'] = x_pred_2

gc.collect()
labels = data['clust_group'].tolist()
tsne_results = TSNE(n_components=2, init='random', random_state=0, perplexity=40).fit_transform(X_sub)

In [None]:
fig = px.scatter(x=tsne_results[:,0], y=tsne_results[:,1], color=labels)
fig.show()

In [None]:
data['clust_group'].value_counts()

In [None]:
# GRP_0 tickets get their own group id 4; the KMeans groups use the ids 0-3.
# assign() returns a new frame, so no value is set on a filtered slice.
temp_12 = temp_06[temp_06['Assignment group'] == 'GRP_0'].assign(clust_group=4)

In [None]:
temp_13= pd.concat([temp_12,data], axis=0)
temp_13['clust_group'].value_counts()

In [None]:
#create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
corpus_model = temp_13['Description']
vectorizer = TfidfVectorizer(stop_words='english')
X_sub = vectorizer.fit_transform(corpus_model)

# clusters = KMeans(4, n_init = 30, algorithm='auto')
# clusters.fit(X_sub)
# x_pred_2=clusters.predict(X_sub)


# # # print(score)
# data1=temp_13.copy()
# data1['clust_group']=x_pred_2

from sklearn.manifold import TSNE
import gc
gc.collect()
# tfIdfMatrix = tfIdfMat.todense()
labels = temp_13['clust_group'].astype('category').to_list()
tsne_results = TSNE(n_components=2,init='random',random_state=0, perplexity=40).fit_transform(X_sub)

In [None]:
fig = px.scatter(x=tsne_results[:,0], y=tsne_results[:,1], color=labels)
fig.show()

In [None]:
temp_13['clust_group'].value_counts()

In [None]:
# Predict the cluster group of a ticket from the TF-IDF features of its description.
cluster_list = [0, 1, 2, 3, 4]
temp_14 = temp_13[temp_13['clust_group'].isin(cluster_list)]
corpus_model = temp_14['Description']

vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1, 3),
                             max_df=.6, min_df=.01,
                             sublinear_tf=True)
X = vectorizer.fit_transform(corpus_model)
X = X.toarray()
# get_feature_names() is missing in newer scikit-learn, get_feature_names_out() in older
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
tfidf = pd.DataFrame(np.round(X, 3), columns=feature_names)
X_tr, X_te, y_tr, y_te = train_test_split(
    tfidf, temp_14['clust_group'],
    test_size=0.1, random_state=10,
    stratify=temp_14['clust_group'])
classfierdict = {'model': [MultinomialNB(),
                           LogisticRegression(solver='liblinear'),
                           KNeighborsClassifier(n_neighbors=3, weights='distance'),
                           RandomForestClassifier(n_estimators=50, random_state=42),
                           LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
# Score on the split created in this cell
data_1 = model_training(classfierdict, X_tr, X_te, y_tr, y_te, 'tfidf')
data_1

In [None]:
# Same cluster-group experiment on PCA-reduced features of the X_tr / X_te split.
# MultinomialNB is not part of this list -- presumably because PCA output can be negative.
X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr, X_te)
classfierdict = {'model': [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
    RandomForestClassifier(n_estimators=50, random_state=42),
    LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
data_1 = model_training(classfierdict, X_tr_pca, X_te_pca, y_tr, y_te, 'pca')
data_1

In [None]:
# Same cluster-group experiment on chi-square selected features of the X_tr / X_te split.
X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr, y_tr, X_te)
classfierdict = {'model': [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
    RandomForestClassifier(n_estimators=50, random_state=42),
    LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
data_1 = model_training(classfierdict, X_tr_chi2, X_te_chi2, y_tr, y_te, 'chi2')
data_1

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
clustering = SpectralClustering(n_clusters=4,assign_labels='discretize',random_state=0).fit(X)

In [None]:
temp_11= temp_06[temp_06['Assignment group'] == 'GRP_0']
# temp_11=temp_06.copy()
temp_11.head(2)

#create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
corpus_model = temp_11['Description']
vectorizer = TfidfVectorizer(stop_words='english')
X_sub = vectorizer.fit_transform(corpus_model)

clustering = SpectralClustering(n_clusters=5,assign_labels='discretize',random_state=0).fit(X_sub)
x_pred_2=clustering.labels_

# # print(score)
data=temp_11.copy()
data['clust_group']=x_pred_2

In [None]:
data['clust_group'].value_counts()

In [None]:
from sklearn.manifold import TSNE
import gc
gc.collect()
# tfIdfMatrix = tfIdfMat.todense()
labels = data['clust_group'].astype('category').to_list()
tsne_results = TSNE(n_components=2,init='random',random_state=0, perplexity=40).fit_transform(X_sub)

In [None]:
fig = px.scatter(x=tsne_results[:,0], y=tsne_results[:,1], color=labels)
fig.show()

In [None]:
vectorizer = TfidfVectorizer(stop_words ='english', 
                              ngram_range = (1,3), 
                              max_df = .6, min_df = .01,
                              sublinear_tf=True)
X = vectorizer.fit_transform(corpus_model)
X=X.toarray()
feature_names = vectorizer.get_feature_names()
tfidf = pd.DataFrame(np.round(X, 3), columns=feature_names)
X_tr, X_te, y_tr, y_te = train_test_split(
                                            tfidf, data['clust_group'],
                                            test_size=0.1, random_state=10, 
                                             stratify=data['clust_group'])
classfierdict={'model': [MultinomialNB(),
                             LogisticRegression(solver='liblinear'),
                             KNeighborsClassifier(n_neighbors=3,weights='distance'),
                             RandomForestClassifier(n_estimators=50, random_state=42),
                             LGBMClassifier(objective='multiclass', random_state=5)]}
    
print('Step 6. Model Execution...')
data_1=model_training(classfierdict,X_tr, X_te, y_tr, y_te,'tfidf')
data_1

In [None]:
# Get Total count by assignment group
temp_11= temp_06[temp_06['Assignment group'] != 'GRP_0']
temp_11.head(2)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


# true_k = 10
# model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
# model.fit(X)

# print("Top terms per cluster:")
# order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# terms = vectorizer.get_feature_names()
# for i in range(true_k):
#     print("Cluster %d:" % i),
#     for ind in order_centroids[i, :10]:
#         print(' %s' % terms[ind]),
#     print

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
cluster_range = range( 2, 10 )

fig, ax = plt.subplots(4, 2, figsize=(15,8))
for i in cluster_range:
    '''
    Create KMeans instance for different number of clusters
    '''
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    q, mod = divmod(i, 2)
    '''
    Create SilhouetteVisualizer instance with KMeans instance
    Fit the visualizer
    '''
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(X)

In [None]:
print('Step 4.1. NLP preprocessing')
# get stopwords in english
normalize_corpus = np.vectorize(func_doc_preprocess)
# create corpus
corpus_01 = []
corpus_01 = temp_06['Description']
corpus_01 = np.array(corpus_01)
print("Sample Original Corpus:", corpus_01[10])    
norm_corpus_01 = normalize_corpus(corpus_01)
print("Clean Corpus:", norm_corpus_01[10])
    
# Second corpus
corpus_02 = []
corpus_02 = temp_06['combined_description']
corpus_02 = np.array(corpus_02)
print("Sample Original Corpus:", corpus_02[10])    
norm_corpus_02 = normalize_corpus(corpus_02)
print("Clean Corpus:", norm_corpus_02[10])

print('Step 4.2. Feature representative of each assignment group')
temp_08=func_feature_importance_by_class(temp_06,norm_corpus_01)

In [None]:
# Get Total count by assignment group
temp_11= temp_06.groupby(['Assignment group'])['Assignment group'].count().to_frame('Total_Count').reset_index()
# consider only assignment groups with more than 2 available data points
temp_11_list=temp_11[temp_11['Total_Count'] >2]['Assignment group']
temp_11=temp_06[temp_06['Assignment group'].isin(temp_11_list)]

print(temp_11.shape)
    
#create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
corpus_model = temp_11['Description']
    

#===================== Experiment  groups with minimum sample Size 2 =================
X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf=func_tfidx_metric(corpus_model,temp_11)
X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr_tfidf, X_te_tfidf)
X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr_tfidf, y_tr_tfidf, X_te_tfidf)

classfierdict={'model': [MultinomialNB(),
                             LogisticRegression(solver='liblinear'),
                             KNeighborsClassifier(n_neighbors=3,weights='distance'),
                             RandomForestClassifier(n_estimators=50, random_state=42),
                             LGBMClassifier(objective='multiclass', random_state=5)]}
    
print('Step 6. Model Execution...')
data_1=model_training(classfierdict,X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf,'tfidf')
data_1

In [None]:
classfierdict={'model': [
                             LogisticRegression(solver='liblinear'),
                             KNeighborsClassifier(n_neighbors=3,weights='distance'),
                             RandomForestClassifier(n_estimators=50, random_state=42),
                             LGBMClassifier(objective='multiclass', random_state=5)]}
data_1=model_training(classfierdict,X_tr_pca, X_te_pca, y_tr_tfidf, y_te_tfidf,'pca')
data_1

In [None]:
classfierdict={'model': [MultinomialNB(),
                             LogisticRegression(solver='liblinear'),
                             KNeighborsClassifier(n_neighbors=3,weights='distance'),
                             RandomForestClassifier(n_estimators=50, random_state=42),
                             LGBMClassifier(objective='multiclass', random_state=5)]}
data_1=model_training(classfierdict,X_tr_chi2, X_te_chi2, y_tr_tfidf, y_te_tfidf,'chi2')
data_1

In [None]:
#===================== Experiment: groups with more than 20 tickets, 'Description' corpus =================
# A group is kept only when its ticket count is strictly greater than this value.
min_group_tickets = 20

# Ticket count per assignment group (kept under its own name so temp_11
# only ever refers to the filtered ticket frame).
assignment_group_counts = (
    temp_06.groupby(['Assignment group'])['Assignment group']
    .count()
    .to_frame('Total_Count')
    .reset_index()
)
# Names of the assignment groups that clear the threshold
temp_11_list = assignment_group_counts.loc[
    assignment_group_counts['Total_Count'] > min_group_tickets, 'Assignment group'
]
# Tickets that belong to one of those groups
temp_11 = temp_06[temp_06['Assignment group'].isin(temp_11_list)]

print(temp_11.shape)

# Create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
# NOTE(review): normalize_corpus is built here but not applied; the raw
# 'Description' column is what gets passed to func_tfidx_metric - confirm intended.
corpus_model = temp_11['Description']

# Feature matrices for this experiment; the PCA and chi2 variants are
# derived from the func_tfidx_metric outputs.
X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf = func_tfidx_metric(corpus_model, temp_11)
X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr_tfidf, X_te_tfidf)
X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr_tfidf, y_tr_tfidf, X_te_tfidf)

# Classifier line-up handed to model_training under the 'model' key.
classfierdict = {'model': [MultinomialNB(),
                           LogisticRegression(solver='liblinear'),
                           KNeighborsClassifier(n_neighbors=3, weights='distance'),
                           RandomForestClassifier(n_estimators=50, random_state=42),
                           LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
data_1 = model_training(classfierdict, X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf, 'tfidf')
# Bare trailing expression so the notebook renders the result of model_training.
data_1

In [None]:
# Same run on the PCA features produced earlier (X_tr_pca / X_te_pca).
# MultinomialNB is not part of this line-up.
# NOTE(review): presumably left out because PCA output can be negative - confirm.
classfierdict = {
    'model': [
        LogisticRegression(solver='liblinear'),
        KNeighborsClassifier(n_neighbors=3, weights='distance'),
        RandomForestClassifier(n_estimators=50, random_state=42),
        LGBMClassifier(objective='multiclass', random_state=5),
    ]
}
data_1 = model_training(
    classfierdict,
    X_tr_pca, X_te_pca,
    y_tr_tfidf, y_te_tfidf,
    'pca',
)
# Bare trailing expression so the notebook renders the result of model_training.
data_1

In [None]:
# Same run on the chi2-selected features produced earlier (X_tr_chi2 / X_te_chi2),
# with the full five-classifier line-up.
classfierdict = {
    'model': [
        MultinomialNB(),
        LogisticRegression(solver='liblinear'),
        KNeighborsClassifier(n_neighbors=3, weights='distance'),
        RandomForestClassifier(n_estimators=50, random_state=42),
        LGBMClassifier(objective='multiclass', random_state=5),
    ]
}
data_1 = model_training(
    classfierdict,
    X_tr_chi2, X_te_chi2,
    y_tr_tfidf, y_te_tfidf,
    'chi2',
)
# Bare trailing expression so the notebook renders the result of model_training.
data_1

In [None]:
#===================== Experiment: groups with more than 2 tickets, 'combined_description' corpus =================
# A group is kept only when its ticket count is strictly greater than this value.
min_group_tickets = 2

# Ticket count per assignment group (kept under its own name so temp_11
# only ever refers to the filtered ticket frame).
assignment_group_counts = (
    temp_06.groupby(['Assignment group'])['Assignment group']
    .count()
    .to_frame('Total_Count')
    .reset_index()
)
# Names of the assignment groups that clear the threshold
temp_11_list = assignment_group_counts.loc[
    assignment_group_counts['Total_Count'] > min_group_tickets, 'Assignment group'
]
# Tickets that belong to one of those groups
temp_11 = temp_06[temp_06['Assignment group'].isin(temp_11_list)]

print(temp_11.shape)

# Create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
# NOTE(review): normalize_corpus is built here but not applied; the raw
# 'combined_description' column is what gets passed to func_tfidx_metric - confirm intended.
corpus_model = temp_11['combined_description']

# Feature matrices for this experiment; the PCA and chi2 variants are
# derived from the func_tfidx_metric outputs.
X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf = func_tfidx_metric(corpus_model, temp_11)
X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr_tfidf, X_te_tfidf)
X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr_tfidf, y_tr_tfidf, X_te_tfidf)

# Classifier line-up handed to model_training under the 'model' key.
classfierdict = {'model': [MultinomialNB(),
                           LogisticRegression(solver='liblinear'),
                           KNeighborsClassifier(n_neighbors=3, weights='distance'),
                           RandomForestClassifier(n_estimators=50, random_state=42),
                           LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
data_1 = model_training(classfierdict, X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf, 'tfidf')
# Bare trailing expression so the notebook renders the result of model_training.
data_1

In [None]:
#===================== Experiment: groups with more than 50 tickets, 'combined_description' corpus =================
# A group is kept only when its ticket count is strictly greater than this value.
min_group_tickets = 50

# Ticket count per assignment group (kept under its own name so temp_11
# only ever refers to the filtered ticket frame).
assignment_group_counts = (
    temp_06.groupby(['Assignment group'])['Assignment group']
    .count()
    .to_frame('Total_Count')
    .reset_index()
)
# Names of the assignment groups that clear the threshold
temp_11_list = assignment_group_counts.loc[
    assignment_group_counts['Total_Count'] > min_group_tickets, 'Assignment group'
]
# Tickets that belong to one of those groups
temp_11 = temp_06[temp_06['Assignment group'].isin(temp_11_list)]

print(temp_11.shape)

# Create corpus for modelling
normalize_corpus = np.vectorize(func_doc_preprocess)
# NOTE(review): normalize_corpus is built here but not applied; the raw
# 'combined_description' column is what gets passed to func_tfidx_metric - confirm intended.
corpus_model = temp_11['combined_description']

# Feature matrices for this experiment; the PCA and chi2 variants are
# derived from the func_tfidx_metric outputs.
X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf = func_tfidx_metric(corpus_model, temp_11)
X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr_tfidf, X_te_tfidf)
X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr_tfidf, y_tr_tfidf, X_te_tfidf)

# Classifier line-up handed to model_training under the 'model' key.
classfierdict = {'model': [MultinomialNB(),
                           LogisticRegression(solver='liblinear'),
                           KNeighborsClassifier(n_neighbors=3, weights='distance'),
                           RandomForestClassifier(n_estimators=50, random_state=42),
                           LGBMClassifier(objective='multiclass', random_state=5)]}

print('Step 6. Model Execution...')
data_1 = model_training(classfierdict, X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf, 'tfidf')
# Bare trailing expression so the notebook renders the result of model_training.
data_1

In [None]:
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess

# LdaMulticore expects a bag-of-words corpus (one list of (token_id, count)
# pairs per document) plus the id -> token mapping. corpus_model holds text
# documents, so tokenize them and build both pieces before training.
lda_tokens = [simple_preprocess(str(doc)) for doc in corpus_model]
lda_dictionary = corpora.Dictionary(lda_tokens)
lda_bow_corpus = [lda_dictionary.doc2bow(tokens) for tokens in lda_tokens]

lda_model = gensim.models.LdaMulticore(
    lda_bow_corpus,
    id2word=lda_dictionary,
    num_topics=10,
    passes=2,
    workers=2,
)

In [None]:
def main():
    """Run the ticket-assignment pipeline end to end and time it.

    Steps, in order: read "input_data.xlsx", clean it, detect the language of
    each 'Description', attach language names from 'language-codes_csv.csv'
    (joined on its 'alpha2' column), drop duplicate tickets, build two
    normalized corpora ('Description' and 'combined_description'), compute
    per-group feature importance, keep assignment groups with more than 15
    tickets, build TF-IDF features, train the classifier line-up, and derive
    the PCA / chi2 feature sets.

    Returns
    -------
    tuple
        ``(temp_05, norm_corpus_01, temp_08, data_1)`` where

        * ``temp_05``  - de-duplicated ticket frame including the
          'combined_description' column,
        * ``norm_corpus_01`` - normalized 'Description' corpus,
        * ``temp_08``  - result of ``func_feature_importance_by_class``,
        * ``data_1``   - result of ``model_training`` on the TF-IDF features.
    """
    start = time.perf_counter()

    print('Step 1. Getting Input Data Information...')
    temp_03 = func_read_data_n_explore("input_data.xlsx")

    print("Step 2. Data Cleaning has started...")
    temp_04 = func_clean_data(temp_03)

    print("Step 3. Detecting text language...")
    temp_04['text_lang'] = temp_04['Description'].apply(func_detect_text_language)
    # Get language name
    lang = pd.read_csv('language-codes_csv.csv')
    temp_05 = pd.merge(temp_04, lang, left_on='text_lang', right_on='alpha2', how='left')
    # Explicit copy so the column assignment below works on an independent frame.
    temp_05 = temp_05.drop_duplicates(
        ['Short description', 'Description', 'Assignment group']
    ).copy()
    # Combine description and short description columns
    temp_05['combined_description'] = (
        temp_05['Short description'].astype(str) + ' ' + temp_05['Description'].astype(str)
    )
    print(temp_05.shape)

    print("Step 4. Data Explorarion...")
    print('')
    print('Data has - {} - ticket assignment groups'.format(len(temp_03['Assignment group'].unique())))
    print('')
    print('Tickets are registered in - {} languages '.format(len(temp_04['text_lang'].unique())))
    print('')
    func_vis01_target_analysis(temp_04)

    print('Step 4.1. NLP preprocessing')
    # Vectorized wrapper so the per-document preprocessor runs over a whole array.
    normalize_corpus = np.vectorize(func_doc_preprocess)

    # First corpus: 'Description' only
    corpus_01 = np.array(temp_05['Description'])
    print("Sample Original Corpus:", corpus_01[10])
    norm_corpus_01 = normalize_corpus(corpus_01)
    print("Clean Corpus:", norm_corpus_01[10])

    # Second corpus: short description + description.
    # Stored under its own name so the first corpus is not overwritten.
    corpus_02 = np.array(temp_05['combined_description'])
    print("Sample Original Corpus:", corpus_02[10])
    norm_corpus_02 = normalize_corpus(corpus_02)
    print("Clean Corpus:", norm_corpus_02[10])

    print('Step 4.2. Feature representative of each assignment group')
    # NOTE(review): uses the 'Description' corpus; switch to norm_corpus_02 if
    # the combined text was the intended input.
    temp_08 = func_feature_importance_by_class(temp_05, norm_corpus_01)

    # Get Total count by assignment group
    group_counts = (
        temp_05.groupby(['Assignment group'])['Assignment group']
        .count()
        .to_frame('Total_Count')
        .reset_index()
    )
    # Consider only assignment groups with more than 15 tickets
    temp_11_list = group_counts[group_counts['Total_Count'] > 15]['Assignment group']
    temp_11 = temp_05[temp_05['Assignment group'].isin(temp_11_list)]

    # Corpus for modelling, defined locally so main() does not depend on a
    # notebook-level variable of the same name.
    corpus_model = temp_11['Description']

    # Feature selection
    print('Step 5. Feature selection...')
    X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf = func_tfidx_metric(corpus_model, temp_11)

    classfierdict = {'model': [MultinomialNB(),
                               LogisticRegression(solver='liblinear'),
                               KNeighborsClassifier(n_neighbors=3, weights='distance'),
                               RandomForestClassifier(n_estimators=50, random_state=42),
                               LGBMClassifier(objective='multiclass', random_state=5)]}

    print('Step 6. Model Execution...')
    data_1 = model_training(classfierdict, X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf, 'tfidf')

    # Derived feature sets; computed here but not used further inside main().
    X_tr_pca, X_te_pca = func_feature_selection_pca(X_tr_tfidf, X_te_tfidf)
    X_tr_chi2, X_te_chi2 = func_feature_selection_chi(X_tr_tfidf, y_tr_tfidf, X_te_tfidf)

    finish = time.perf_counter()
    print(f'Total Execution Time: Finished in {round(finish-start, 2)} second(s)')
    return temp_05, norm_corpus_01, temp_08, data_1
