In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split


In [24]:
########################### DATA LOADING #################################

## LOCATION OF THE TRAINING WORKBOOK. SET THE TRAINING_DATA_PATH ENVIRONMENT
## VARIABLE TO RUN THE NOTEBOOK ON ANOTHER MACHINE; THE ORIGINAL PATH IS THE DEFAULT.
TRAINING_DATA_PATH = os.environ.get(
    'TRAINING_DATA_PATH',
    r'C:\Users\K7857178\Desktop\73_Strings\Training Data.xlsx',
)
train = pd.read_excel(TRAINING_DATA_PATH)
print(train.shape)
########################################################################

(2002, 8)


In [22]:
########################### FEATURE REMOVAL ##############################


In [25]:
## dropna=False COUNTS MISSING VALUES TOO, SO THE COLUMN ONLY LOOKS CONSTANT
## WHEN IT REALLY HOLDS ONE VALUE IN EVERY ROW.
train['Company Type'].value_counts(dropna=False)

Public Company    2002
Name: Company Type, dtype: int64

In [18]:
## dropna=False COUNTS MISSING VALUES TOO, SO THE COLUMN ONLY LOOKS CONSTANT
## WHEN IT REALLY HOLDS ONE VALUE IN EVERY ROW.
train['Company Status'].value_counts(dropna=False)

Operating    2002
Name: Company Status, dtype: int64

In [19]:
## dropna=False COUNTS MISSING VALUES TOO, SO THE COLUMN ONLY LOOKS CONSTANT
## WHEN IT REALLY HOLDS ONE VALUE IN EVERY ROW.
train['Geographic Locations'].value_counts(dropna=False)

United States of America (Primary)    2002
Name: Geographic Locations, dtype: int64

In [20]:
### 'Company Type', 'Company Status' AND 'Geographic Locations' EACH HOLD A SINGLE
### VALUE FOR EVERY ROW, SO THEY CARRY NO SIGNAL FOR THE MODEL AND ARE DROPPED.
constant_columns = ['Company Type', 'Company Status', 'Geographic Locations']
train = train.drop(columns=constant_columns)

In [None]:
###############################################################################


In [None]:
########################## FEATURE IMPROVEMENT ##############################


In [26]:
## CHECK WHETHER THE TWO TICKER COLUMNS HOLD EXACTLY THE SAME VALUES.
## Series.equals GIVES A SINGLE True/False FOR THE WHOLE COLUMN; HERE IT IS False.
train['Exchange:Ticker'].equals(train['Security Tickers'])

False

In [28]:
## KEEPING ONLY THE FIRST TAG OF EACH ';'-SEPARATED CLASSIFICATION LIST.
## THE VECTORISED .str ACCESSOR PROPAGATES MISSING VALUES INSTEAD OF RAISING
## AttributeError, WHICH MATTERS BECAUSE THE NULL CHECK RUNS AFTER THIS CELL.
train['Industry Classifications'] = train['Industry Classifications'].str.split(';').str[0]
print(train['Industry Classifications'].head(2))

0                    Beauty Care Products (Primary)
1    Catalog Flowers, Gifts and Novelties (Primary)
Name: Industry Classifications, dtype: object


In [29]:
## REMOVING THE ' (Primary)' MARKER FROM THE CLASSIFICATION.
## regex=False TREATS THE PARENTHESES AS LITERAL CHARACTERS, AND THE .str
## ACCESSOR LEAVES MISSING VALUES UNTOUCHED INSTEAD OF RAISING.
train['Industry Classifications'] = train['Industry Classifications'].str.replace(' (Primary)', '', regex=False)
print(train['Industry Classifications'].head(2))

0                    Beauty Care Products
1    Catalog Flowers, Gifts and Novelties
Name: Industry Classifications, dtype: object


In [31]:
## KEEPING ONLY THE TEXT FEATURE AND THE TARGET LABEL
model_columns = ['Business Description', 'Industry Classifications']
train = train[model_columns]
print(train.shape)

(2002, 2)


In [None]:
##############################################################################

In [None]:
######################### HANDLING MISSING/DUMMY VALUES ######################

In [32]:
## CHECK FOR MISSING VALUES
def draw_null_values_table(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name, ordered by descending null
    count, with two columns: 'Total' (number of nulls) and 'Percentage'
    (nulls as a percentage of the number of rows in ``df``).
    """
    missing_per_column = df.isnull().sum().sort_values(ascending=False)
    share_of_rows = missing_per_column * 100 / df.shape[0]
    return pd.concat(
        [missing_per_column, share_of_rows],
        axis=1,
        keys=['Total', 'Percentage'],
    )

## DISPLAY THE NULL COUNT AND NULL PERCENTAGE FOR EVERY COLUMN OF train
draw_null_values_table(train)

Unnamed: 0,Total,Percentage
Industry Classifications,0,0.0
Business Description,0,0.0


In [33]:
## 32 ROWS CONTAIN ONLY THE PLACEHOLDER '-' INSTEAD OF A DESCRIPTION.
placeholder_mask = train['Business Description'].eq('-')
train[placeholder_mask].shape

(32, 2)

In [35]:
## DROPPING THE PLACEHOLDER ROWS SINCE THERE ARE ONLY A FEW OF THEM.
## .copy() MAKES THE RESULT AN INDEPENDENT FRAME, SO THE LATER ASSIGNMENTS TO
## 'Business Description' DO NOT TRIGGER SettingWithCopyWarning.
train = train[train['Business Description'] != '-'].copy()
print(train.shape)

(1970, 2)


In [None]:
###################################################################################

In [None]:
############################## CLEANING TEXT ##################################

In [36]:
import re

## EVERY CHARACTER LISTED HERE (PUNCTUATION, QUOTES, BACKSLASH, TAB, NEWLINE)
## IS MAPPED TO A SINGLE SPACE. BUILT ONCE INSTEAD OF ON EVERY CALL.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: " " for char in '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'}
)


def clean_text(text):
    """Normalise a free-text description for bag-of-words vectorisation.

    Steps, in order:
      1. replace punctuation, quotes and backslashes with spaces;
      2. drop stand-alone single ASCII letters (e.g. the "s" left behind by
         "company's");
      3. collapse whitespace runs to one space and trim both ends;
      4. lowercase the result.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        Lowercase text whose words are separated by exactly one space, with no
        leading or trailing whitespace.
    """
    ## PUNCTUATION GOES FIRST SO THE SPACES IT LEAVES BEHIND ARE COLLAPSED BELOW
    text = text.translate(_PUNCTUATION_TO_SPACE)

    ## REMOVING SINGLE LETTERS. LOOKAROUNDS DO NOT CONSUME THE SURROUNDING
    ## WHITESPACE, SO CONSECUTIVE SINGLE LETTERS ("a b c") ARE ALL REMOVED.
    text = re.sub(r"(?<!\S)[a-zA-Z](?!\S)", " ", text)

    ## REPLACING MULTIPLE SPACES WITH A SINGLE SPACE
    text = re.sub(r"\s+", " ", text)

    ## TRIMMING AND CONVERTING TEXT TO LOWERCASE
    return text.strip().lower()

## CLEAN EVERY DESCRIPTION; THE FUNCTION IS PASSED DIRECTLY, NO LAMBDA NEEDED
train['Business Description'] = train['Business Description'].apply(clean_text)

In [None]:
##############################################################################

In [None]:
######################### PERFORMING LEMMATIZATION ##########################

In [37]:

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
## ONE SHARED WordNetLemmatizer INSTANCE REUSED FOR EVERY DESCRIPTION
lemmatizer = WordNetLemmatizer()

def lammetize_text(txt):
    """Tokenize ``txt`` with ``word_tokenize``, lemmatize each token with the
    shared ``lemmatizer``, and return the results joined by single spaces."""
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(txt))
    return ' '.join(lemmas)
    
## LEMMATIZE EVERY DESCRIPTION; THE FUNCTION IS PASSED DIRECTLY, NO LAMBDA NEEDED
train['Business Description'] = train['Business Description'].apply(lammetize_text)

In [None]:
##############################################################################

In [None]:
######################## BUILDING BAG OF WORDS MATRIX ###################

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
## BAG-OF-WORDS COUNTS OVER THE DESCRIPTIONS, WITH ENGLISH STOP WORDS REMOVED
## AND max_features SET TO 15000.
## NOTE(review): THE VOCABULARY IS FITTED ON EVERY ROW BEFORE THE TRAIN/TEST
## SPLIT, SO TEST-SET WORDS SHAPE THE FEATURE SPACE - CONFIRM THIS IS ACCEPTABLE.
count_vect = CountVectorizer(stop_words="english", max_features=15000)
bow_sparse = count_vect.fit_transform(train['Business Description'])
X_vectors = bow_sparse.toarray()

In [None]:
###########################################################################

In [None]:
####################### TRAIN TEST SPLIT ##############################

In [39]:
from sklearn.model_selection import train_test_split
## 70/30 SPLIT OF FEATURES AND LABELS; random_state=0 KEEPS IT REPRODUCIBLE
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    train['Industry Classifications'],
    test_size=0.3,
    random_state=0,
)

In [None]:
#######################################################################

In [None]:
########################### MODEL BUILDING ##############################

In [40]:
from sklearn.metrics import accuracy_score
def check_model_accuray(X_train,y_train,X_test,y_test,model):
    """Fit ``model`` on the training split and print its test-split accuracy.

    ``model`` is fitted in place via ``model.fit(X_train, y_train)``; its
    predictions for ``X_test`` are scored against ``y_test`` with
    ``accuracy_score`` and printed as a percentage with two decimals.
    Nothing is returned.
    """
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print("Accuracy on the dataset is: {:.2f}".format(test_accuracy * 100))

    
from sklearn.linear_model import LogisticRegression
## LOGISTIC REGRESSION WITH DEFAULT SETTINGS, FITTED AND SCORED IN ONE CALL
model = LogisticRegression()
check_model_accuray(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=model,
)

Accuracy on the dataset is: 52.79


In [None]:
#########################################################################

In [None]:
######################### SERIALIZING THE MODEL #########################

In [41]:
import pickle
## THE with BLOCKS CLOSE THE FILES EVEN IF pickle.dump RAISES.
with open('US_COMPANIES_CLASSIFICATION.pkl', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=2)

## THE MODEL ONLY ACCEPTS VECTORS PRODUCED BY THE FITTED CountVectorizer, SO
## THE VECTORIZER IS SAVED TOO; WITHOUT IT NEW TEXT CANNOT BE TURNED INTO FEATURES.
with open('US_COMPANIES_CLASSIFICATION_VECTORIZER.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vect, vectorizer_file, protocol=2)

In [None]:
#########################################################################