# Text Classification Demo

This notebook tests whether a simple text classifier can match IFS PartNo values with G+ types.

In [2]:
# use natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# word stemmer
# NOTE(review): `stemmer` is not referenced by any cell visible in this notebook --
# tokens are only lowercased, never stemmed. Confirm whether stemming was intended.
stemmer = LancasterStemmer()

import pandas as pd
import csv
from sklearn import preprocessing

### Import Training Data

The training data holds the class in the `PartNo` column and the sentence in the `ModelDescription` column.
It is loaded into a pandas DataFrame.

In [3]:
# Load the training data from a semicolon-separated CSV file.
# An earlier variant of this cell read the comma-separated file
# Mercedes_Code_Description_Ddb.csv from the same folder instead.
trainingData = pd.read_csv(
    'C:\\Users\\godes\\Desktop\\OFV_REGISTRERINGER_VIN_SAMI\\Mercedes_FullSeries_Description_AddedRows.csv',
    sep=';',
)

In [4]:
# Build two lookup tables from the training data:
#   corpus_words: lowercased token -> number of occurrences in the whole training corpus
#   class_words:  PartNo (class)   -> list of lowercased tokens seen in its descriptions
# Tokens are lowercased only; no stemming is applied.
corpus_words = {}
class_words = {}

# Unique classes in first-seen order. dict.fromkeys de-duplicates while keeping
# insertion order, whereas iterating a set of strings can yield a different order in
# every kernel session (hash randomization). The order matters: classify() keeps the
# first class that reaches the top score, so a stable order makes ties reproducible.
classes = list(dict.fromkeys(trainingData['PartNo']))

for c in classes:
    # prepare a list of words within each class
    class_words[c] = []

# Walk the two columns directly instead of iterrows(): only these two values are needed,
# and no per-row Series has to be built.
for part_no, description in zip(trainingData['PartNo'], trainingData['ModelDescription']):
    # tokenize each sentence into words
    for word in nltk.word_tokenize(description):
        word_lower = word.lower()
        # count the token across the whole corpus (the word's commonality)
        corpus_words[word_lower] = corpus_words.get(word_lower, 0) + 1
        # remember the token for the class this sentence belongs to
        class_words[part_no].append(word_lower)

# corpus_words now holds each lowercased word and its number of occurrences,
# and class_words holds all words seen for each class
#print ("Corpus words and counts: %s \n" % corpus_words)
print ("Class words: %s" % class_words)

Class words: {'B160CDI': ['mercedes-benz', 'b-klasse', 'b160cdi'], 'GLA180 AMG': ['mercedes-benz', 'gla-klasse', 'gla180', 'amg'], 'G63AMG V': ['mercedes-benz', 'g-wagen', 'g63amg', 'v'], 'S400D L 4M': ['mercedes-benz', 's-klasse', 's400d', 'l', '4m'], 'W516/43L KA   SB': ['mercedes-benz', 'sprinter', 'w516/43l', 'ka', 'sb'], 'V119PRO A2 4M SP': ['mercedes-benz', 'vito', 'v119pro', 'a2', '4m', 'sp'], 'B180CDI T': ['mercedes-benz', 'b-klasse', 'b180cdi', 't'], 'V114 A3       SV': ['mercedes-benz', 'vito', 'v114', 'a3', 'sv'], 'GLC250DC AMGP': ['mercedes-benz', 'glc-klasse', 'glc250dc', 'amgp'], 'A200 AMG': ['mercedes-benz', 'a-klasse', 'a200', 'amg'], 'GLA180 URBP': ['mercedes-benz', 'gla-klasse', 'gla180', 'urbp'], 'AMG GLE43C': ['mercedes-benz', 'gle-klasse', 'amg', 'gle43c'], '319/37 KA     SV': ['mercedes-benz', 'sprinter', '319/37', 'ka', 'sv'], 'AMG SLC43': ['mercedes-benz', 'slc-klasse', 'amg', 'slc43'], 'E200T T': ['mercedes-benz', 'e-klasse', 'e200t', 't'], 'B200NGT': ['mercede

In [10]:
# Inspect the tokens collected for one specific class
print("Class words: %s" % (class_words['A250 SPORT 4M'],))

Class words: ['mercedes-benz', 'a-klasse', 'a250', 'sport', '4m']


In [6]:
def calculate_class_score_commonality(sentence, class_name, show_details=False):
    """Score how well `sentence` matches `class_name`, weighting words by rarity.

    The sentence is tokenized with nltk and every token is lowercased (no stemming
    is applied). Each token that appears in class_words[class_name] contributes
    1 / corpus_words[token], so words that are frequent in the training corpus
    count for less than rare ones.

    Args:
        sentence: text to score.
        class_name: key into the module-level class_words table.
        show_details: when True, print every matching token with its weight.

    Returns:
        The summed weight of all matching tokens (0 when nothing matches).
    """
    total = 0
    for token in nltk.word_tokenize(sentence):
        key = token.lower()
        # tokens unknown to this class contribute nothing
        if key not in class_words[class_name]:
            continue
        # relative weight: the more common the word in the corpus, the smaller its share
        weight = 1 / corpus_words[key]
        total += weight
        if show_details:
            print ("   match: %s (%s)" % (key, weight))
    return total

In [7]:
def classify(sentence, details=False):
    """Return the best-scoring class for `sentence`.

    Every class in class_words is scored with calculate_class_score_commonality;
    the first class that reaches the highest score strictly above 0 wins.

    Args:
        sentence: text to classify.
        details: forwarded as show_details to the scoring function.

    Returns:
        A dict with the keys "class" (None when no class scores above 0),
        "score" and "sentence".
    """
    best_class, best_score = None, 0
    for candidate in class_words:
        candidate_score = calculate_class_score_commonality(sentence, candidate, show_details=details)
        # strict comparison: on ties the earlier class is kept
        if candidate_score > best_score:
            best_class, best_score = candidate, candidate_score

    return {"class": best_class, "score": best_score, "sentence": sentence}

### Test the solution

In [8]:
# Quick check of the classifier on a single description
# (the trailing comma is part of the input string)
classify(sentence="A200d,")

{'class': 'A200', 'score': 1.0, 'sentence': 'A200d,'}

## Create Prediction Set

In [11]:
# Load the sentences to classify: the first column of every data row in the CSV.
# The file is opened in a `with` block so the handle is closed as soon as reading
# is done (previously it was never closed).
with open('C:\\Users\\godes\\Desktop\\OFV_REGISTRERINGER_VIN_SAMI\\Gjensidige_Description.csv',
          newline='') as csv_file:
    spamReader = csv.reader(csv_file, delimiter=',', quotechar='|')

    # the first row holds the csv headers, so skip it (no error if the file is empty)
    next(spamReader, None)

    # csv.reader yields an empty list for a blank line; skip those instead of
    # failing on row[0]
    test_data = [row[0] for row in spamReader if row]

print(len(test_data))
#print(test_data)

77
['A160', 'A160 aut', 'A160d', 'A160d aut', 'A180', 'A180 aut', 'A180d', 'A180d aut', 'A200', 'A200 aut', 'A200d', 'A200d aut', 'A200d 4MATIC aut', 'A220 4MATIC', 'A220d', 'A220d 4MATIC aut', 'A250', 'A250 aut', 'A250 Sport', 'A250 Sport aut', 'A250 Sport 4MATIC aut.', 'A250 4MATIC aut', 'A160', 'A160 aut', 'A160d', 'A160d aut', 'A180', 'A180 aut', 'A180d', 'A180d aut', 'A200', 'A200 aut', 'A200d', 'A200d aut', 'A200d 4MATIC aut', 'A220 4MATIC', 'A220d', 'A220d 4MATIC aut', 'A250', 'A250 aut', 'A250 Sport', 'A250 Sport aut', 'A250 Sport 4MATIC aut.', 'A250 4MATIC aut', 'A160', 'A160 aut', 'A160d', 'A160d aut', 'A180', 'A180 aut', 'A180d', 'A180d aut Kombicoupe', 'A180d aut Kombicoupe', 'A200', 'A200 aut Kombicoupe', 'A200 aut Kombicoupe', 'A200 AMG Edition aut', 'A200 AMG Edition aut (K)', 'A200 AMG Edition Plus aut', 'A200 AMG Edition Plus aut (K)', 'A200 Progressive Edition aut', 'A200 Progressive Edition aut (K)', 'A200 Progressive Edition Plus aut', 'A200 Progressive Edition Plus

In [12]:
# Classify every sentence of the prediction set; each entry is the dict returned by classify()
result = [classify(sentence) for sentence in test_data]

# Tabulate the results (one row per sentence)
df = pd.DataFrame(result)
print(df)


            class     score                               sentence
0            A160  1.000000                                   A160
1            A160  1.000000                               A160 aut
2            A160  1.000000                                  A160d
3            A160  1.000000                              A160d aut
4            A180  0.333333                                   A180
5            A180  0.333333                               A180 aut
6           A180D  1.000000                                  A180d
7           A180D  1.000000                              A180d aut
8        A200 AMG  0.166667                                   A200
9        A200 AMG  0.166667                               A200 aut
10           A200  1.000000                                  A200d
11           A200  1.000000                              A200d aut
12           A200  1.000000                       A200d 4MATIC aut
13        A220 4M  0.642857                            A220 4M

In [117]:
# Write the classification results to a semicolon-separated CSV file
df.to_csv(
    "C:\\Users\\godes\\Desktop\\OFV_REGISTRERINGER_VIN_SAMI\\ResultsFromJupyter\\output.csv",
    sep=";",
)

In [28]:
# Min-max scale the scores.
# MinMaxScaler works on 2-D input, so the score column is reshaped to a single-column array.
min_max_scaler = preprocessing.MinMaxScaler()
x = df['score'].values.reshape(-1, 1)
x_scaled = min_max_scaler.fit_transform(x)

# NOTE(review): x_scaled is never written back to df, so the frame printed below
# still shows the unscaled scores -- confirm whether a scaled column was intended.
print(df)

            class     score                               sentence
0            A160  1.000000                                   A160
1            A160  1.000000                               A160 aut
2            A160  1.000000                                  A160d
3            A160  1.000000                              A160d aut
4            A180  0.333333                                   A180
5            A180  0.333333                               A180 aut
6           A180D  1.000000                                  A180d
7           A180D  1.000000                              A180d aut
8        A200 AMG  0.166667                                   A200
9        A200 AMG  0.166667                               A200 aut
10           A200  1.000000                                  A200d
11           A200  1.000000                              A200d aut
12           A200  1.000000                       A200d 4MATIC aut
13        A220 4M  0.642857                            A220 4M

In [118]:
# Tally how many sentences were assigned to each predicted class
count = {}
for prediction in result:
    label = prediction['class']
    count[label] = count.get(label, 0) + 1

print("Total count:", count)

Total count: {'A160': 12, 'A180 AMGP': 12, 'A180D': 7, 'A200 PRO': 7, 'A200': 9, 'A220 4M': 9, 'A250 4M': 10, 'A250 SPORT 4M': 9, 'AT EDITION P': 2}
