## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender 
+ Sklearn
+ Pandas
+ Text Extraction

# EDA packages

In [22]:
# EDA packages
import pandas as pd
import numpy as np


In [23]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


# Load our data

In [24]:
# Load our data
df = pd.read_excel(r"C:\Users\Jebbar Abdelkader\Desktop\Garbage\Dataset.xlsx")

# shape of our data

In [25]:
df.shape

(563914, 2)

In [26]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['name', 'sex'], dtype='object')

In [27]:
# Data Types
df.dtypes

name    object
sex     object
dtype: object

# Checking for Missing Values

In [28]:
# Checking for Missing Values
df.isnull().isnull().sum()

name    0
sex     0
dtype: int64

In [29]:
# Number of Female Names
df[df.sex == 'F'].shape

(157669, 2)

In [30]:
# Number of Male Names
df[df.sex == 'M'].shape

(406245, 2)

In [31]:
df[df.sex == 'N'].shape

(0, 2)

In [32]:
df_names = df

In [33]:
# Replacing All F,M and N with 0, 1 and 2 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [34]:
df_names.sex.unique()

array([1, 0], dtype=int64)

In [35]:
df_names.dtypes

name    object
sex      int64
dtype: object

In [36]:
Xfeatures =df_names['name']

# Feature Extraction 

In [37]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [38]:
cv.get_feature_names()

['_mohammed',
 '_s',
 'aababou',
 'aaballa',
 'aaban',
 'aabana',
 'aabas',
 'aabaslama',
 'aabassi',
 'aabbar',
 'aabbas',
 'aabbassy',
 'aabbou',
 'aabbouki',
 'aabd',
 'aabdati',
 'aabdelatif',
 'aabdeljalil',
 'aabdelouahab',
 'aabderahman',
 'aabdi',
 'aabdouni',
 'aabed',
 'aabedi',
 'aabedin',
 'aabedy',
 'aabella',
 'aabha',
 'aabi',
 'aabiba',
 'aabibi',
 'aabid',
 'aabida',
 'aabidi',
 'aabidine',
 'aabir',
 'aabirate',
 'aabirouche',
 'aabirrouche',
 'aablla',
 'aabou',
 'aaboud',
 'aabriella',
 'aachar',
 'aachati',
 'aachchaqui',
 'aachir',
 'aachor',
 'aachour',
 'aachraoui',
 'aada',
 'aadaim',
 'aadam',
 'aadan',
 'aadaoui',
 'aadarsh',
 'aaddi',
 'aaddou',
 'aadel',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadiati',
 'aadil',
 'aadile',
 'aadim',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadmi',
 'aadnan',
 'aadon',
 'aadoui',


In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Features 
X
# Labels
y = df_names.sex

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Naive Bayes Classifier

In [60]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.8783153539234521

# Accuracy of our Model

In [61]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 87.8315353923452 %


In [62]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 98.47975497733947 %


### Sample Prediction

In [63]:
# Sample1 Prediction
sample_name = ["fatiha"]
vect = cv.transform(sample_name).toarray()

In [64]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [65]:
# Female is 0, Male is 1,none is 2
clf.predict(vect)

array([0], dtype=int64)

In [66]:
# Sample2 Prediction
sample_name1 = ["mhammed atae allah"]
vect1 = cv.transform(sample_name1).toarray()

In [67]:
clf.predict(vect1)

array([1], dtype=int64)

In [68]:
# Sample3 Prediction of Russian Names
sample_name2 = ["fateh"]
vect2 = cv.transform(sample_name2).toarray()

In [69]:
clf.predict(vect2)

array([1], dtype=int64)

# Sample Prediction of Random Names

In [70]:
# Sample3 Prediction of Random Names
sample_name3 = ["kabir"]
vect3 = cv.transform(sample_name3).toarray()

In [71]:
clf.predict_proba(vect3)

array([[0.01812307, 0.98187693]])

In [72]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [73]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Male
None
Female
None


### Using a custom function for feature analysis

In [74]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [75]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [76]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [77]:
df_y = df_names['sex']

In [78]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [79]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [80]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=33)

In [81]:
dfX_train

array([{'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'sal', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'},
       {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'n', 'last2-letters': 'in', 'last3-letters': 'min'},
       {'first-letter': 'm', 'first2-letters': 'mi', 'first3-letters': 'mim', 'last-letter': 'e', 'last2-letters': 'ne', 'last3-letters': 'une'},
       ...,
       {'first-letter': 'h', 'first2-letters': 'ho', 'first3-letters': 'hou', 'last-letter': 'a', 'last2-letters': 'da', 'last3-letters': 'uda'},
       {'first-letter': 'a', 'first2-letters': 'ah', 'first3-letters': 'ahm', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'},
       {'first-letter': 'd', 'first2-letters': 'da', 'first3-letters': 'day', 'last-letter': 'n', 'last2-letters': 'en', 'last3-letters': 'ven'}],
      dtype=object)

In [82]:

dv = DictVectorizer()
dv.fit_transform(dfX_train)


<377822x10380 sparse matrix of type '<class 'numpy.float64'>'
	with 2266932 stored elements in Compressed Sparse Row format>

# data of names 

In [615]:
#female=pd.read_excel(r"C:\Users\Jebbar Abdelkader\Desktop\Gender-prediction\dataset_garbage\ListeFE.xlsx")

#DGI_prenom=pd.read_excel(r"C:\Users\Jebbar Abdelkader\Desktop\Prénoms_DGI_sans_numéros_Abderrahmane.xlsx")

In [83]:
pp_19_stock=pd.read_excel(r"C:\Users\Jebbar Abdelkader\Desktop\confidentialite\Base 2019\Personnes physiques\pp_19_stock.xlsx")

In [85]:
pp_19_stock.head()

Unnamed: 0,IDENTIFIANT_FISCAL,IDENTIFIANT_COMMUN_ENTREPRISE,RC_unique,NUMERO_DU_REGISTRE_DU_COMMERCE,CODE_CENTRE_REGISTRE_COMMERCE,NUMERO_DE_LA_CNSS,NOM_OU_RAISON_SOCIALE,ADRESSE,CODE_ANNEE,ID_TECHNIQUE,...,Nombre_employés_12_2019,Tranches_ca,LIB_REGION,Activité,Tranches_CA,Libellé Section,REGION,Tranches_CA_18,Tranches_CA_17,tranche_17_18_19
0,25225478.0,2073931000000.0,0000_NA,0,,1000101.0,TARBI MINA,N2 BAB ZORGANE 1 ER ETAGE IMM CHERIF,2019,0a2c184fbfdd84ba517e4c080199949e,...,3.0,,Souss-Massa,8622,,SANTÉ HUMAINE ET ACTION SOCIALE,Souss-Massa,"]0 , 1]",,"]0 , 1]"
1,73662580.0,1164771000000.0,18172_NA,18172,,1000125.0,TOUILE AHMED,DOUAR KASBA LAMHADI EL GUERDANE,2019,a91f2e3831615918b1716135b1b7140e,...,1.0,,Souss-Massa,3314,,INDUSTRIE MANUFACTURIÈRE,Souss-Massa,,CA NULL,CA NULL
2,78000180.0,1848616000000.0,3986_NA,3986,,1000199.0,OUTADRART LAHOUCINE,SOUK TNINE TAFINGOULT OULAD BERHIL,2019,51e1ed44dbd3e5bbd07f3ee7ff765b0f,...,1.0,,Souss-Massa,4729,,COMMERCE ; RÉPARATION D'AUTOMOBILES ET DE MOTO...,Souss-Massa,"]1 , 3]",,"]1 , 3]"
3,77635319.0,2133845000000.0,7816_66,7816,66.0,1000214.0,MAHRAJ HASSAN,KISSARIAT AL MANIRA JNANE JAMAA,2019,f33bfe2dd61905596bf73def430cbc8b,...,,"]0 , 1]",Souss-Massa,4778,"]0 , 1]",COMMERCE ; RÉPARATION D'AUTOMOBILES ET DE MOTO...,Souss-Massa,,"]0 , 1]","]0 , 1]"
4,40259590.0,2132185000000.0,17570_NA,17570,,1000238.0,AMZIL AHMED,BLOC B N 56 HAY ASSALAM AIT IAAZA,2019,28e5e3012f5630820f4d6cae91e2c62c,...,1.0,,Souss-Massa,3250,,INDUSTRIE MANUFACTURIÈRE,Souss-Massa,,"]0 , 1]","]0 , 1]"


In [107]:
pp_19_stock.shape

(143305, 87)

In [157]:
142793-143305

-512

In [108]:
#pp_19_stock['name']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:x.str.replace('’','').lower().strip())

In [114]:
pp_19_stock['NOM_OU_RAISON_SOCIALE']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].replace(np.nan,'Non renseignée')

In [115]:
import unidecode

In [116]:
#for ind in pp_19_stock['NOM_OU_RAISON_SOCIALE']:
   # try:
     #   print(int(ind))
    #except:
        
       # pp_19_stock['name']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].replace(ind,unidecode.unidecode(ind))
        
    
    


In [117]:
list_noms=set(pp_19_stock['NOM_OU_RAISON_SOCIALE'].to_list())

In [118]:
list_prenoms=[]
list_siege_sociale=[]
for i in list_noms:
    try :
        list_siege_sociale.append(int(i))     
    except:
        list_prenoms.append(i)
print(len(list_prenoms))
print(len(list_siege_sociale))

136795
1


In [119]:
list_prenoms

['KHMIRA',
 'CHATO HOCIN',
 'GUANOUNI MUSTAPHA',
 'ABOUZIDANE MOHAMMED',
 'ZAIDANE ABDERRAZZAK',
 'MOUTAOUKIL BOUJEMEA',
 'GASSA BADR',
 'TAKLANI ABDELLATIF PP',
 'MR  SAHA MOHAMMED',
 'MLLE ALAOUI LALLA BOUCHRA',
 'BIDA ALI',
 'REDA FATHMI MOHAMED',
 'AREQTI ABDELLAH',
 'EL MOUTAOUKIL LAHCEN',
 'HOMMANY REDOUANE',
 'EL KANDOUSSI  LOUBNA',
 'FARHAN ABDELLATIF',
 'EL YAZRI ABDELLATIF',
 'BINIZ ALI',
 'LAMARTI LARBI',
 'OFQUIR SAID',
 'SBIRI ALAA',
 'MR KTIRI MOHAMED',
 'HERCULES-III',
 'MESROUR RACHID',
 'CHOUAIBI JAMAL EDDINE',
 'SALMOUN AHMED',
 'EL ABRARI JAOUHARI SIDI JAMAL',
 'HIJJA 4',
 'EL FANNIRI ABDELKADER',
 'DARIF ALI',
 'AMERNI MOHAMED',
 'BOUDYAMAN ABDELALI',
 'ZAALOUK SAID',
 'OUBERKOUCH BOUBKER',
 'MARHABA DEUX',
 'CHIBANE EL HOUSSAINE',
 'BOUAZZAH SAIDA  CAFE PARISTA',
 'DARMOUCH AMINA',
 'MR ADERGHAL LAHOUCINE',
 'DRIHEM AZIZ',
 'LAHRECH SONIA',
 'AMANOUZ AHMED',
 'NASSIRI JAMAL',
 'AZOUR ABDELAZIZ',
 'EL HAJOUI GHZIEL SAMIRA',
 'BEN DAIF BOUSSELHAM',
 'EL KASSIBI MALIK

# Function for naive bayes

In [120]:
# A function to do it
def genderpredictorNB(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        return "Female"
    elif clf.predict(vector) == 1:
        return "Male"
    
    else:
        return "None"

In [121]:
pp_19_stock['gender_NB']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorNB(x))

In [122]:
#DGI_prenom['gender_NB']= DGI_prenom['name'].apply(lambda x:genderpredictorNB(x))

In [124]:
# probability predict
def genderpredictorNB_P(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        return abs(clf.predict_proba(vector)[0][0])
    elif clf.predict(vector) == 1:
        return abs(clf.predict_proba(vector)[0][1])
    
    else:
        return abs(clf.predict_proba(vector)[0][2])

In [125]:
pp_19_stock['proba_predict_NB']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorNB_P(x))

In [None]:
#DGI_prenom['proba_predict_NB']=DGI_prenom['name'].apply(lambda x:genderpredictorNB_P(x))

# Model building Using DecisionTree

In [126]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)


DecisionTreeClassifier()

In [127]:
#Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
#Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 
 # Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.9873617735335687
0.9606323753842185


# A function od decision tree

In [128]:

def genderpredictorDT(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        return "Female"
    elif dclf.predict(vector) == 1:
        return "Male"
    else:
        return "None"

In [129]:
pp_19_stock['gender_DT']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorDT(x))

In [130]:
#DGI_prenom['gender_DT']=DGI_prenom['name'].apply(lambda x:genderpredictorDT(x))

In [131]:
# probability predict
def genderpredictorDT_P(a):
    test_name = [a]
    vector = dv.transform(features(test_name)).toarray()
    if dclf.predict(vector) == 0:
        return abs(dclf.predict_proba(vector)[0][0])
    elif dclf.predict(vector) == 1:
        return abs(dclf.predict_proba(vector)[0][1])
    
    else:
        return abs(dclf.predict_proba(vector)[0][2])

In [132]:
pp_19_stock['proba_predict_DT']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorDT_P(x))

In [133]:
#DGI_prenom['proba_predict_DT']=DGI_prenom['name'].apply(lambda x:genderpredictorDT_P(x))

# Create Random Forest Model

In [135]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
Rclf=RandomForestClassifier(n_estimators=120)

#Train the model using the training sets y_pred=clf.predict(X_test)
Rclf.fit(my_xfeatures, dfy_train)     


RandomForestClassifier(n_estimators=120)

# A function to do it

In [136]:
def genderpredictorRF(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if Rclf.predict(vector) == 0:
        return "Female"
    elif Rclf.predict(vector) == 1:
        return "Male"
    else:
        return "None"

In [137]:
pp_19_stock['gender_RF']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorRF(x))

In [138]:
#DGI_prenom['gender_RF']=DGI_prenom['name'].apply(lambda x:genderpredictorRF(x))

In [139]:
# probability predict
def genderpredictorRF_P(a):
    test_name = [a]
    vector = dv.transform(features(test_name)).toarray()
    if Rclf.predict(vector) == 0:
        return abs(Rclf.predict_proba(vector)[0][0])
    elif Rclf.predict(vector) == 1:
        return abs(Rclf.predict_proba(vector)[0][1])
    
    else:
        return abs(Rclf.predict_proba(vector)[0][2])

In [142]:
pp_19_stock['proba_predict_RF']=pp_19_stock['NOM_OU_RAISON_SOCIALE'].apply(lambda x:genderpredictorRF_P(x))

In [143]:
#DGI_prenom['proba_predict_RF']=DGI_prenom['name'].apply(lambda x:genderpredictorRF_P(x))

In [145]:
pp_19_stock

Unnamed: 0,IDENTIFIANT_FISCAL,IDENTIFIANT_COMMUN_ENTREPRISE,RC_unique,NUMERO_DU_REGISTRE_DU_COMMERCE,CODE_CENTRE_REGISTRE_COMMERCE,NUMERO_DE_LA_CNSS,NOM_OU_RAISON_SOCIALE,ADRESSE,CODE_ANNEE,ID_TECHNIQUE,...,Tranches_CA_18,Tranches_CA_17,tranche_17_18_19,name,gender_NB,proba_predict_NB,gender_DT,proba_predict_DT,gender_RF,proba_predict_RF
0,25225478.0,2.073931e+12,0000_NA,0000,,1000101.0,TARBI MINA,N2 BAB ZORGANE 1 ER ETAGE IMM CHERIF,2019,0a2c184fbfdd84ba517e4c080199949e,...,"]0 , 1]",,"]0 , 1]",TARBI MINA,Female,0.974199,Female,1.000000,Female,1.000000
1,73662580.0,1.164771e+12,18172_NA,18172,,1000125.0,TOUILE AHMED,DOUAR KASBA LAMHADI EL GUERDANE,2019,a91f2e3831615918b1716135b1b7140e,...,,CA NULL,CA NULL,TOUILE AHMED,Male,0.995242,Male,1.000000,Male,1.000000
2,78000180.0,1.848616e+12,3986_NA,3986,,1000199.0,OUTADRART LAHOUCINE,SOUK TNINE TAFINGOULT OULAD BERHIL,2019,51e1ed44dbd3e5bbd07f3ee7ff765b0f,...,"]1 , 3]",,"]1 , 3]",OUTADRART LAHOUCINE,Male,0.926715,Male,1.000000,Male,1.000000
3,77635319.0,2.133845e+12,7816_66,7816,66.0,1000214.0,MAHRAJ HASSAN,KISSARIAT AL MANIRA JNANE JAMAA,2019,f33bfe2dd61905596bf73def430cbc8b,...,,"]0 , 1]","]0 , 1]",MAHRAJ HASSAN,Male,0.989772,Male,1.000000,Male,1.000000
4,40259590.0,2.132185e+12,17570_NA,17570,,1000238.0,AMZIL AHMED,BLOC B N 56 HAY ASSALAM AIT IAAZA,2019,28e5e3012f5630820f4d6cae91e2c62c,...,,"]0 , 1]","]0 , 1]",AMZIL AHMED,Male,0.995692,Male,1.000000,Male,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143300,,,,,,9989908.0,IMAD II,PORT JORF LASFAR,2019,1bbd6209c522874cd1d6f0bf3510d665,...,,,,IMAD II,Male,0.991087,Male,1.000000,Male,0.589583
143301,,,,,,9989940.0,HAYAT,PORT JORF LASFAR,2019,4da5e0bed9b5eed1968f4d86bc352f9f,...,,,,HAYAT,Female,0.854567,Female,1.000000,Female,1.000000
143302,,,141792_NA,141792,,9990111.0,HARKIK ABDELLAH,SD SMAIL A DROITE,2019,76bbc6b5ce05f7efca1b85480df963c0,...,,,,HARKIK ABDELLAH,Male,0.941693,Male,1.000000,Male,0.912577
143303,,4.175900e+11,373288_NA,373288,,9992761.0,MR LAHBOUB AZZOUZ,BP 41139 AGENCE MEKDAD LAHRIZI DOUAR LAAYAYS...,2019,7dfd718ad3c1dba948448064816f5f52,...,,,,MR LAHBOUB AZZOUZ,Male,0.995887,Male,1.000000,Male,0.912303


In [154]:
dgi_19_new=pp_19_stock[['name','gender_NB','proba_predict_NB','gender_DT','proba_predict_DT','gender_RF','proba_predict_RF']]

# Sample prediction

In [156]:
dgi_19_new.to_excel(r"C:\Users\Jebbar Abdelkader\Desktop\pp_19_stock_new.xlsx")

In [131]:
dgi_2018_integers

Unnamed: 0.1,Unnamed: 0,IFU,NOM,PRENOMS,ICE,NUM_REGISTRE_COMMERCE,CODE_CENTRE_REGISTRE_COMMERCE,VILLE,NUM_CNSS,ADRESSE,...,CODE_FORME_JURIDIQUE,CHIFFRE_DAFFAIRE_FORFAITAIRE,CHIFFRE_DAFFAIRE_RNR,CODE_OPTION_IMPOT,gender_NB,proba_predict_NB,gender_DT,proba_predict_DT,gender_RF,proba_predict_RF
0,590,32515352,ghazouani med,123456789,1.658054e+12,,,OUEZZANE (M),,10 MOSQUE MED IV LAADIR,...,11,0,4000.00,,Male,0.665442,Male,1.0,Male,0.859028
1,1356,87875954,ahammar yahya,123456789,,,,NADOR (M),,3 IMM 44 RUE TOKYO,...,11,0,11116.66,,Male,0.664344,Female,1.0,Female,0.900278
2,1494,81900220,belaouchi siham,123456789,5.069450e+11,45600.0,61.0,TANGER,7509584.0,NOUINOUICH COM. BAHRAOUIYINE,...,11,0,1082421.65,,Female,0.801664,Male,1.0,Male,0.983333
3,1511,87853598,kantal morad,123456789,5.078520e+11,32802.0,49.0,NADOR (M),8086367.0,QY REGULARES,...,11,0,415377.23,,Male,0.798325,Male,1.0,Male,0.925000
4,3293,91705621,el hard rkia,123456789,,,,TEMARA (M),,1891 MASS I,...,11,0,0.00,,Male,0.623709,Female,1.0,Female,0.983333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,68345,72402010,boussiri allal,123456789,2.273042e+12,,,OUARZAZATE (M),,OULED BRAHIM AMZAOUROU SKOURA,...,11,0,236250.00,,Male,0.798325,Female,1.0,Female,1.000000
92,70810,87855093,toummouhi fatima,123456789,1.920506e+12,27192.0,49.0,NADOR (M),,QUARTIER ARRID PAR RTE DE TAOUIMA,...,11,0,1995975.50,,Female,0.969064,Female,1.0,Female,0.991667
93,70847,88216525,aarab nour-ddine,123456789,5.109920e+11,39166.0,49.0,BNI ANSAR (M),9060857.0,PORT DE BENI ENSAR,...,11,0,0.00,,Male,0.642725,Female,1.0,Male,0.548810
94,71667,38312740,zehouani mustapha,123456789,,,,KHEMISSET (M),,57 CHATEAU D'EAU,...,11,25200,0.00,,Male,0.748038,Female,1.0,Female,0.883333


In [132]:
#DGI_prenom.to_excel(r"C:\Users\Jebbar Abdelkader\Desktop\DGI_prenoms.xlsx")

In [133]:
#female[female['gender_NB']=='Male']

In [134]:
#DGI_prenom

In [135]:
#DGI_names=pd.read_excel(r"C:\Users\Jebbar Abdelkader\Desktop\DGI_prenoms.xlsx")

In [136]:
#Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
#Accuracy on training set
#print(dclf.score(dv.transform(dfX_train), dfy_train)) 
# Accuracy on test set
#print(dclf.score(dv.transform(dfX_test), dfy_test))

In [137]:
import numpy as np

In [138]:
DGI_names['genre']=np.where((DGI_names['gender_RF']==DGI_names['gender_DT'])&(DGI_names['gender_DT']==DGI_names['gender_NB']),DGI_names['gender_RF'],False)

NameError: name 'DGI_names' is not defined

In [677]:
DGI_names['proba']=np.where((DGI_names['gender_RF']==DGI_names['gender_DT'])&(DGI_names['gender_DT']==DGI_names['gender_NB']),DGI_names['proba_predict_NB'],False)

In [678]:
#np.where((DGI_names['gender_RF']==DGI_names['gender_NB'])&((DGI_names['gender_RF']==DGI_names['gender_DT'])),DGI_names['gender_RF'],False)

In [679]:
DGI_names.genre.value_counts()

Male      7695
False     4644
Female    3064
Name: genre, dtype: int64

In [685]:
DGI_names[DGI_names['genre']=='Female']

Unnamed: 0.1,Unnamed: 0,name,gender_NB,proba_predict_NB,gender_DT,proba_predict_DT,gender_RF,proba_predict_RF,genre,proba
3,3,sfai,Female,0.834777,Female,1.0,Female,1.000000,Female,0.834777
5,5,ijja,Female,0.858416,Female,1.0,Female,1.000000,Female,0.858416
8,8,arbia,Female,0.751951,Female,1.0,Female,1.000000,Female,0.751951
20,20,mahassine (osiris optic),Female,0.987661,Female,1.0,Female,0.793333,Female,0.987661
29,29,fany,Female,0.876137,Female,1.0,Female,1.000000,Female,0.876137
...,...,...,...,...,...,...,...,...,...,...
15380,15380,maazouza,Female,0.858416,Female,1.0,Female,1.000000,Female,0.858416
15384,15384,agazzara,Female,0.503841,Female,1.0,Female,0.910000,Female,0.503841
15387,15387,rada,Female,0.889915,Female,1.0,Female,1.000000,Female,0.889915
15395,15395,par ammadi latifa,Female,0.941481,Female,1.0,Female,0.790000,Female,0.941481


In [686]:
DGI_names[DGI_names['genre']==False]

Unnamed: 0.1,Unnamed: 0,name,gender_NB,proba_predict_NB,gender_DT,proba_predict_DT,gender_RF,proba_predict_RF,genre,proba
6,6,el boutahiri,Male,0.832567,Female,1.0,Male,0.810000,False,0.0
12,12,dan,Male,0.748038,Female,0.5,Male,0.529714,False,0.0
13,13,hamidi,Male,0.664344,Female,1.0,Male,0.760000,False,0.0
14,14,habboudi,Female,0.502609,Male,1.0,Male,0.950000,False,0.0
27,27,othmane,Female,0.502609,Male,1.0,Male,0.664583,False,0.0
...,...,...,...,...,...,...,...,...,...,...
15393,15393,sophie marie ghislain,Female,0.986376,Male,1.0,Male,0.790000,False,0.0
15394,15394,ettoubi,Female,0.503841,Female,1.0,Male,0.710000,False,0.0
15397,15397,aicha heritiers,Female,0.875601,Male,1.0,Male,0.865000,False,0.0
15401,15401,idrisse,Male,0.664344,Female,1.0,Female,0.619000,False,0.0


In [687]:
DGI_names.to_excel(r"C:\Users\Jebbar Abdelkader\Desktop\DGI_names.xlsx")    