In [28]:
import re
import pickle
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
def extractNewWordFeatures(word, vocabulary_path="TfidfVocabulary.pkl"):
    """Vectorise column names with a TF-IDF vectoriser bound to a saved vocabulary.

    Parameters
    ----------
    word : iterable of str
        The documents to vectorise. Must be an iterable of strings (e.g. a
        list of cleaned column names), not a single string.
    vocabulary_path : str, optional
        Path of the pickle file whose content is passed as ``vocabulary`` to
        ``TfidfVectorizer``. Defaults to ``"TfidfVocabulary.pkl"``.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.fit_transform(word)``.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load vocabulary
    # files that come from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(vocabulary_path, 'rb') as vocabulary_file:
        oldtfidf = pickle.load(vocabulary_file)

    # NOTE(review): per the scikit-learn docs, max_df / min_df are ignored
    # when an explicit vocabulary is supplied; they are kept here unchanged.
    tfidf = TfidfVectorizer(encoding='latin-1',
                            max_df=0.4662381347098422,
                            min_df=2,
                            ngram_range=(1, 3),
                            sublinear_tf=True,
                            token_pattern='\\w{1,}',
                            vocabulary=oldtfidf)

    # NOTE(review): fit_transform re-estimates the IDF weights from `word`
    # itself, so the features of one name depend on the other names in the
    # same batch. Confirm this matches how the model was trained.
    return tfidf.fit_transform(word)

In [30]:
def cleanColName(colName):
    """Normalise a column name into lower-case, space-separated words.

    Steps, in order:

    1. Names of at most one character, and names made only of digits and
       spaces, are returned unchanged.
    2. A trailing run of digits is stripped and ``_`` becomes a space.
    3. Names that are all upper-case, or that already contain several
       whitespace-separated words, are lower-cased.
    4. camelCase names (lower-case prefix followed by a capital) are split
       at the capitals.
    5. Any remaining capitals (e.g. PascalCase) are used as split points;
       two adjacent single capitals are glued together (``I``, ``D`` ->
       ``ID``). Text that precedes the first capital is kept.
    6. Whitespace is collapsed and the result is lower-cased.

    Parameters
    ----------
    colName : str
        Raw column name.

    Returns
    -------
    str
        The cleaned name.
    """
    # Too short to clean: returned as-is.
    if len(colName) <= 1:
        return colName

    # Only digits/spaces: returned as-is.
    if re.search("^[ 0-9]+$", colName) is not None:
        return colName

    # Drop a numeric suffix such as the "2" in "address2".
    colName = re.sub("[0-9]+$", '', colName)

    # Underscores act as word separators.
    colName = colName.replace("_", " ")

    # All-caps names and names that are already multi-word need no splitting
    # on capitals, so they are simply lower-cased.
    if colName.isupper() or len(colName.split()) > 1:
        colName = colName.lower()

    # camelCase: a purely lower-case prefix directly followed by a capital.
    if re.search('^[a-z]+[A-Z]+', colName) is not None:
        words = re.findall('[A-Z][^A-Z]*', colName)
        if len(words) > 1 and len(words[0]) == 1 and len(words[1]) == 1:
            # Starts with an acronym ("userID"): chunks are joined without
            # spaces. Note this joins *all* chunks, so "userIDNumber"
            # becomes "user idnumber".
            tail = ''.join(words)
        else:
            tail = ' '.join(words)
        # Everything before the first capitalised chunk is the prefix.
        head = colName.split(words[0])[0]
        colName = (head + ' ' + tail).lower()

    # Remaining capitals (PascalCase, or a prefix that is not purely a-z).
    words = re.findall('[A-Z][^A-Z]*', colName)
    if words:
        # Keep whatever precedes the first capital; without it a name such
        # as "my2Name" would be reduced to just "name".
        head = colName[:re.search('[A-Z]', colName).start()]
        merged = []
        i = 0
        while i < len(words):
            # Two consecutive single capitals form a two-letter acronym.
            if i + 1 < len(words) and len(words[i]) == 1 and len(words[i + 1]) == 1:
                merged.append(words[i] + words[i + 1])
                i += 1
            else:
                merged.append(words[i])
            i += 1
        colName = ' '.join([head] + merged)

    # Collapse repeated/leading/trailing whitespace, then lower-case.
    colName = ' '.join(colName.split())
    return colName.lower()
    


In [31]:
def predictWordsTypes(words,model,id_to_category):
    """Predict a type label for each column name.

    Parameters
    ----------
    words : iterable of str
        Raw column names. Duplicates collapse to a single key in the result.
    model : object
        Any object exposing ``predict(features)`` that returns one label id
        per input row.
    id_to_category : mapping
        Maps ``str(label_id)`` to the type name.

    Returns
    -------
    dict
        ``{raw_name: type_name}``; empty when ``words`` is empty.
    """
    # Materialise once: `words` is traversed twice below (cleaning and the
    # final zip), which would silently yield nothing for a one-shot iterator.
    words = list(words)

    # Nothing to predict: avoid pushing an empty batch through the
    # vectoriser and the model.
    if not words:
        return {}

    cleaned_words = [cleanColName(name) for name in words]

    features = extractNewWordFeatures(cleaned_words)
    preds = model.predict(features)
    # Label ids are looked up by their string form.
    outTypes = [id_to_category[str(label)] for label in preds]
    return dict(zip(words, outTypes))

<h1>Load model and types dictionary:</h1>

In [32]:
# SECURITY: pickle.load can execute arbitrary code; only load "model.sav"
# when it comes from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open("model.sav", 'rb') as model_file:
    model = pickle.load(model_file)

# Label-id -> type-name lookup, stored as JSON.
with open("id_to_category.txt") as category_file:
    id_to_category = json.load(category_file)

<h1>Predict:</h1>

In [33]:
# Sample column names to classify. "name" is listed twice on purpose-preserved
# from the original cell; the returned dict keeps a single entry per name.
words = [
    "name", "sex", "birth_date", "relationship", "hours", "name",
    "budget", "location", "locations", "ssn", "status", "salary",
    "address", "first_name", "middle_initial", "last_name", "start_date",
]
print(predictWordsTypes(words,model,id_to_category))

{'name': 'str', 'sex': 'str', 'birth_date': 'datetime', 'relationship': 'str', 'hours': 'str', 'budget': 'str', 'location': 'str', 'locations': 'str', 'ssn': 'str', 'status': 'str', 'salary': 'float', 'address': 'str', 'first_name': 'str', 'middle_initial': 'str', 'last_name': 'str', 'start_date': 'datetime'}
