## Loading Datasets

In [41]:
import pandas as pd 
import numpy as np
import gender_guesser.detector as gender

In [42]:
# Read the genuine-account and fake-account tables from their CSV files.
real_users, fake_users = map(
    pd.read_csv,
    ('datasets/realusers.csv', 'datasets/fakeusers.csv'),
)

In [43]:
# Show (rows, columns) for each table, one per line: real first, then fake.
print(real_users.shape, fake_users.shape, sep='\n')

(1481, 41)
(1337, 41)


## Data Preprocessing 
#### Concatenate both datasets

In [44]:
# Stack the two tables row-wise into one frame: real accounts first, fake accounts after.
# The original row labels are kept, so index values repeat across the two halves.
x = pd.concat(objs=[real_users, fake_users], axis=0)
x

Unnamed: 0.1,Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,verified,description,updated,dataset,age_in_days,ratio statuses_count/age,ratio Favorites/age,ratio Friends/Followers,length_of_bio,reputation
0,0,3610511,Davide Dellacasa,braddd,20370,5470,2385,145,52,2007-04-06 10:58:22+00:00,...,,Founder of http://www.screenweek.it & http://w...,2015-02-14 10:54:49,E13,5822,3.499,0.025,0.436,151,0.696372
1,1,5656162,Simone Economo,eKoeS,3131,506,381,9,40,2007-04-30 15:08:42+00:00,...,,BSc degree (cum laude) in Computer Engineering...,2015-02-14 10:54:49,E13,5797,0.540,0.002,0.753,104,0.570462
2,2,5682702,tacone,tacone_,4024,264,87,323,16,2007-05-01 11:53:40+00:00,...,,Cogito ergo bestemmio.,2015-02-14 10:54:49,E13,5797,0.694,0.056,0.330,22,0.752137
3,3,6067292,alesaura,alesstar,40586,640,622,1118,32,2007-05-15 16:55:16+00:00,...,,"Se la vita ti dà sarde, scapocciale!",2015-02-14 10:54:49,E13,5782,7.019,0.193,0.972,36,0.507132
4,4,6015122,Angelo,PerDiletto,2016,62,64,13,0,2007-05-13 19:52:00+00:00,...,,Je me souviens,2015-02-14 10:54:49,E13,5784,0.349,0.002,1.032,14,0.492063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332,1332,1391497074,Verda Marks,VerdaMarks1,1,0,17,0,0,2013-04-30 08:23:57+00:00,...,,I have been in business onlin and offline for ...,2015-02-14 10:40:01,INT,3606,0.000,0.000,inf,158,0.000000
1333,1333,1391544607,Danial Campbell,DanialCampbell2,0,1,17,0,0,2013-04-30 08:34:49+00:00,...,,,2015-02-14 10:40:01,INT,3606,0.000,0.000,17.000,3,0.055556
1334,1334,1391622127,Maudie Meyer,MaudieMeyer1,2,0,15,0,0,2013-04-30 09:21:12+00:00,...,,,2015-02-14 10:40:01,INT,3606,0.001,0.000,inf,3,0.000000
1335,1335,1391832212,Harriett Harvey,HarriettHarvey9,2,0,16,0,0,2013-04-30 11:25:11+00:00,...,,,2015-02-14 10:40:01,INT,3606,0.001,0.000,inf,3,0.000000


In [45]:
# `x` is already a DataFrame, so wrapping it in pd.DataFrame() adds nothing and
# does not copy the data by default. Take an explicit, independent copy so the
# feature columns added to `df` later never alias the concatenated frame `x`.
df = x.copy()

In [46]:
# Target labels: 1 = real user, 0 = fake user.
# The labels MUST follow the row order of the concatenated frame, which was built
# as pd.concat([real_users, fake_users]) -- real rows first, fake rows second.
# (Listing the fake labels first would mislabel rows whenever the two tables
# differ in length, and invert every label when they are equal.)
y = len(real_users)*[1] + len(fake_users)*[0]

#### Gender Detection

In [54]:
# Create a detector instance (case-insensitive name lookup)
sex_predictor = gender.Detector(case_sensitive=False)

# Mapping of predicted gender label -> signed integer code
# (negative = female side, positive = male side, 0 = no usable signal)
sex_dict = {'female': -2, 'mostly_female': -1, 'unknown': 0, 'mostly_male': 1, 'male': 2}

# Extract the first whitespace-delimited token of each display name.
# str.split() with no separator ignores leading/repeated whitespace, so a name
# such as ' Anna  Rossi' yields 'Anna' rather than an empty string. Missing or
# blank names become '' so the detector is always handed a str.
df['First Name'] = df['name'].str.split().str.get(0).fillna('')

# Predict a gender label for every first name. Per the gender-guesser docs the
# detector may answer 'andy' (androgynous); sex_dict has no code for that label,
# so fold it into 'unknown'.
df['Predicted Sex'] = (
    df['First Name']
    .apply(sex_predictor.get_gender)
    .replace('andy', 'unknown')
)

# Map the predicted genders to codes; any label missing from sex_dict falls back
# to the 'unknown' code instead of producing NaN and breaking the int cast.
df['Sex Code'] = (
    df['Predicted Sex']
    .map(sex_dict)
    .fillna(sex_dict['unknown'])
    .astype(int)
)

In [56]:
# Peek at five randomly chosen rows to spot-check the new gender columns.
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,...,dataset,age_in_days,ratio statuses_count/age,ratio Favorites/age,ratio Friends/Followers,length_of_bio,reputation,First Name,Predicted Sex,Sex Code
443,443,215216086,psicoblogger.it,megstudio,1200,4878,5335,2899,13,2010-11-13 09:18:50+00:00,...,E13,4505,0.266,0.644,1.094,61,0.477627,psicoblogger.it,unknown,0
1024,1024,619456829,Hilma Brock,hilmabrockhdz,26,10,237,0,0,2012-06-26 22:02:46+00:00,...,INT,3913,0.007,0.0,23.7,124,0.040486,Hilma,female,-2
216,216,616228429,Edmund Sargent,edmundsarg,63,20,572,0,0,2012-06-23 15:53:00+00:00,...,INT,3916,0.016,0.0,28.6,3,0.033784,Edmund,male,2
1119,1119,578916985,Matte Granata,MattewGra,68,31,95,44,0,2012-05-13 12:24:59+00:00,...,E13,3958,0.017,0.011,3.065,79,0.246032,Matte,unknown,0
588,588,319091474,Daniele Colombo ,ColomboDaniele,293,42,139,0,0,2011-06-17 15:04:47+00:00,...,E13,4288,0.068,0.0,3.31,47,0.232044,Daniele,female,-2
