In [76]:
import pandas as pd
import re
import pickle
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
def extractFeatures(df):
    """Fit a TF-IDF vectorizer on ``df.word`` and return features and labels.

    Side effect: the fitted vocabulary (``tfidf.vocabulary_``) is pickled to
    ``../modelOutput/TfidfVocabulary.pkl`` so that ``extractNewWordFeatures``
    can rebuild a vectorizer with the same columns.

    Parameters
    ----------
    df : object exposing ``word`` and ``dataType_id`` attributes
        ``word`` holds the text documents, ``dataType_id`` the class labels.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is the dense TF-IDF array
        and ``labels`` is ``df.dataType_id``.
    """
    tfidf = TfidfVectorizer(encoding='latin-1',
                            max_df=0.9288781834957359,
                            min_df=2,
                            ngram_range=(1, 3),
                            sublinear_tf=True,
                            token_pattern='\\w{1,}')

    features = tfidf.fit_transform(df.word).toarray()
    labels = df.dataType_id

    # Save vectorizer.vocabulary_. The context manager guarantees the file is
    # flushed and closed even if pickling fails.
    # NOTE(review): only the vocabulary is persisted, not the fitted IDF
    # weights, so a vectorizer rebuilt from this file has to re-estimate IDF.
    with open("../modelOutput/TfidfVocabulary.pkl", "wb") as vocab_file:
        pickle.dump(tfidf.vocabulary_, vocab_file)

    return features, labels

In [78]:
def tryModels(features, labels, models=None, folds_no=5):
    """Cross-validate candidate classifiers and collect per-fold accuracy.

    Parameters
    ----------
    features : feature matrix passed straight to ``cross_val_score``.
    labels : target labels passed straight to ``cross_val_score``.
    models : list of estimators, optional
        Candidates to evaluate. Defaults to a single
        ``MultinomialNB(alpha=0.3382826838838474)``, the only candidate that
        was enabled originally (RandomForestClassifier, LinearSVC and
        LogisticRegression were commented out).
    folds_no : int, optional
        Value forwarded as ``cv`` (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns ``model_name``, ``fold_idx``
        and ``accuracy``.
    """
    if models is None:
        models = [MultinomialNB(alpha=0.3382826838838474)]

    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=folds_no)
        entries.extend(
            (model_name, fold_idx, accuracy)
            for fold_idx, accuracy in enumerate(accuracies)
        )
        print("finsihed cross validation for model: ",model_name)

    # Build the frame once from the collected records.
    return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [79]:
def extractNewWordFeatures(word):
    """Vectorize new documents using the vocabulary saved at training time.

    Loads the vocabulary pickled at ``../modelOutput/TfidfVocabulary.pkl`` and
    builds a ``TfidfVectorizer`` restricted to it, so the output columns line
    up with the vocabulary that was saved.

    Parameters
    ----------
    word : iterable of str
        Documents (e.g. cleaned column names) to vectorize.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` (not densified).
    """
    # NOTE: pickle.load can execute arbitrary code; only load vocabulary files
    # produced by this project.
    with open("../modelOutput/TfidfVocabulary.pkl", 'rb') as vocab_file:
        oldtfidf = pickle.load(vocab_file)

    # With a fixed vocabulary scikit-learn ignores max_df / min_df, so the
    # values below do not prune anything; they are kept unchanged.
    tfidf = TfidfVectorizer(encoding='latin-1',
                            max_df=0.4662381347098422,
                            min_df=2,
                            ngram_range=(1, 3),
                            sublinear_tf=True,
                            token_pattern='\\w{1,}',
                            vocabulary=oldtfidf)

    # NOTE(review): fit_transform re-estimates the IDF weights from `word`
    # itself, so the weighting depends on the batch being vectorized rather
    # than on the training corpus. Fixing this needs the fitted vectorizer
    # (or its idf_ vector) to be persisted at training time -- TODO confirm
    # and address together with the training code.
    features = tfidf.fit_transform(word)
    return features

In [80]:
def cleanColName(colName):
    """Normalise a column name into lower-case, space-separated words.

    Steps, in order:
      1. Names of length <= 1 are returned lower-cased.
      2. Names consisting only of digits and spaces are returned unchanged.
      3. Trailing digits are stripped and underscores become spaces.
      4. All-upper-case names and names that already contain several
         whitespace-separated words are lower-cased.
      5. Names starting with lower-case letters followed by a capital
         (``userName``) are split at the capitals.
      6. Any remaining name containing capitals (``FirstName``, ``UserID``) is
         split at the capitals; two adjacent single-letter pieces are merged
         (``I`` + ``D`` -> ``ID``). Text before the first capital is kept.
      7. Runs of whitespace are collapsed and the result is lower-cased.

    Parameters
    ----------
    colName : str
        Raw column name.

    Returns
    -------
    str
        The cleaned name.
    """
    # Single characters (and the empty string) have nothing to split.
    if len(colName) <= 1:
        return colName.lower()

    # Names that are only digits/spaces are passed through untouched.
    if re.search("^[ 0-9]+$", colName) is not None:
        return colName

    # Remove numbers at the end of the string.
    colName = re.sub("[0-9]+$", '', colName)

    # Replace _ with space.
    colName = colName.replace("_", " ")

    # Lower-case names that are fully upper-case or already space separated.
    if colName.isupper() or len(colName.split()) > 1:
        colName = colName.lower()

    # Lower-case prefix followed by capitals, e.g. "userName".
    if re.search('^[a-z]+[A-Z]+', colName) is not None:
        words = re.findall('[A-Z][^A-Z]*', colName)
        if len(words) > 1 and len(words[0]) == 1 and len(words[1]) == 1:
            # Leading run of single capitals: keep the pieces glued together.
            result = ''.join(words)
        else:
            result = ' '.join(words)
        prefix = colName.split(words[0])[0]
        colName = (prefix + ' ' + result).lower()

    # Remaining names that still contain capitals, e.g. "FirstName".
    first_upper = re.search('[A-Z]', colName)
    if first_upper is not None:
        # Keep whatever precedes the first capital; rebuilding the name only
        # from the capitalised pieces would drop it ("my2Name" -> "name").
        prefix = colName[:first_upper.start()]
        words = re.findall('[A-Z][^A-Z]*', colName)
        newWords = []
        i = 0
        while i < len(words):
            if i + 1 < len(words) and len(words[i]) == 1 and len(words[i + 1]) == 1:
                # Two consecutive single capitals form one token.
                newWords.append(words[i] + words[i + 1])
                i += 1
            else:
                newWords.append(words[i])
            i += 1
        # An empty prefix only adds a leading space, removed just below.
        colName = ' '.join([prefix] + newWords)

    # Remove multiple spaces.
    colName = ' '.join(colName.split())

    # Convert to lower case.
    return colName.lower()
    


In [81]:
def predictWordsTypes(words,model):
    """Predict a data-type label for every name in ``words``.

    Each name is normalised with ``cleanColName``, vectorized with
    ``extractNewWordFeatures`` and classified by ``model.predict``; predicted
    ids are translated through the module-level ``id_to_category`` mapping.

    Returns a dict keyed by the original (uncleaned) names. Repeated names
    collapse into a single key holding the value of the last occurrence.
    """
    normalized = list(map(cleanColName, words))
    predicted_ids = model.predict(extractNewWordFeatures(normalized))

    # Resolve every id up front so an unknown id fails before pairing.
    type_names = [id_to_category[type_id] for type_id in predicted_ids]
    return {original: type_name for original, type_name in zip(words, type_names)}

<h1>Read Dataset</h1>

In [82]:
# Load both labelled corpora; only the last expression of a cell is rendered,
# so just the SQL-schema preview is shown.
many_types_4_py_df, sql_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/manytypes4py/data.csv",
        "../datasets/schemas/data.csv",
    )
)
sql_df.head()


Unnamed: 0,word,dataType
0,country name,STR
1,country code,STR
2,indicator name,STR
3,indicator code,STR
4,sales order id,INTEGER


<h1>Prepare DataFrame</h1>

<h2>Merge datasets</h2>

In [83]:
#sql_df = sql_df[np.isin(sql_df, ['TIME','BOOL']).any(axis=1)]
#sql_df = sql_df.replace(['TIME','BOOL'],['datetime','bool'])
#sql_df.head()

# Stack both corpora. ignore_index=True gives every row a unique label;
# otherwise the row labels of the two source frames are simply repeated.
df = pd.concat([sql_df, many_types_4_py_df], ignore_index=True)

# Map the SQL-style type names onto the Python-style labels. The replacement
# is limited to the label column so that a *word* that happens to be exactly
# 'TIME', 'STR', ... is not rewritten as well.
df['dataType'] = df['dataType'].replace(
    ['STR', 'INTEGER', 'TIME', 'FLOAT', 'BINARY'],
    ['str', 'int', 'datetime', 'float', 'bool'],
)

#df = many_types_4_py_df

<h3>Prepare dataframe:</h3>

In [84]:
# Clean first, then encode. Factorising before dropna() would give rows with a
# missing dataType the sentinel id -1 and leak a NaN entry into the mappings.
# NOTE: ids follow first appearance in the cleaned frame, so they can differ
# from a previous run if rows containing NaN used to come first.
df = df.dropna()
# `keep` must be passed by keyword: it is keyword-only in current pandas.
df = df.drop_duplicates(subset=['word', 'dataType'], keep='first')

# assign() returns a new frame instead of writing into a filtered one.
df = df.assign(dataType_id=df['dataType'].factorize()[0])
category_id_df = df[['dataType', 'dataType_id']].drop_duplicates().sort_values('dataType_id')
category_to_id = dict(category_id_df.values)
id_to_category = {v: k for k, v in category_to_id.items()}
print(category_to_id)
print(id_to_category)

#duplicate = df[df.duplicated(['word'] , False)]
#sorted_dub = duplicate.sort_values(by=['word'])
#print(len(sorted_dub))
#sorted_dub.to_csv("duplicates_in_df.csv")
#df.to_csv("df.csv")


{'str': 0, 'int': 1, 'datetime': 2, 'float': 3, 'bool': 4}
{0: 'str', 1: 'int', 2: 'datetime', 3: 'float', 4: 'bool'}


Dump the id-to-category dictionary to disk:

In [85]:
# Persist the id -> label mapping. The context manager flushes and closes the
# file; JSON object keys are strings, so the integer ids are written as text.
with open("../modelOutput/id_to_category.txt", 'w') as mapping_file:
    json.dump(id_to_category, mapping_file)


In [86]:
X = df.word
Y = df["dataType_id"]

# Stratified 75/25 split. A fixed random_state makes the split, and therefore
# the accuracy reported further down, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=42
)

# Re-attach each label column to its text column.
train = pd.concat([X_train, Y_train], axis=1)
test = pd.concat([X_test, Y_test], axis=1)


In [87]:
#TF IDF: fit the vectorizer on the training split and get its feature matrix and labels
# (extractFeatures also pickles the fitted vocabulary to ../modelOutput/)
features , labels = extractFeatures(train)


In [88]:
#cv_df = tryModels(features , labels)
# Train the Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB(alpha=0.06926447564538141)
model.fit(features,labels)

# Persist the fitted model; the context manager flushes and closes the file.
with open("../modelOutput/model.sav", "wb") as model_file:
    pickle.dump(model, model_file)
#accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=3)


<h1>Calculate accuracy:</h1>

In [89]:
# Score the held-out split: vectorize its words with the saved vocabulary,
# predict, and compare against the true label ids.
true_preds = test["dataType_id"].values
features = extractNewWordFeatures(test.word)
preds = model.predict(features)

print(preds)
print(true_preds)

# Percentage of positions where prediction and ground truth agree.
hits = sum(preds == true_preds)
accuracy = (hits / len(preds))*100
print("test set accuracy = ",accuracy)


[0 0 1 ... 0 0 1]
[4 1 1 ... 0 1 3]
test set accuracy =  72.95913825336063


<h1>Predict:</h1>

In [90]:
words = ["name","sex","birth_date","relationship","hours","name","budget","location","locations","ssn","status","salary","address","first_name","middle_initial","last_name","start_date"]

# Reload the persisted model. NOTE: pickle.load can execute arbitrary code,
# so only load model files produced by this notebook.
with open("../modelOutput/model.sav", 'rb') as model_file:
    model = pickle.load(model_file)
print(predictWordsTypes(words,model))

{'name': 'str', 'sex': 'str', 'birth_date': 'datetime', 'relationship': 'str', 'hours': 'int', 'budget': 'float', 'location': 'str', 'locations': 'str', 'ssn': 'str', 'status': 'str', 'salary': 'float', 'address': 'str', 'first_name': 'str', 'middle_initial': 'str', 'last_name': 'str', 'start_date': 'datetime'}
