# DAT550 Project
**Robin Liebert, Armin Sabri, Kathir Tahasin, Elisabeth Eik** <br>
**15.05.2022**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

Read the data sets we have for the project:

In [2]:
sites = pd.read_csv('../data/all-partisan-sites.csv')
domain = pd.read_csv('../data/domaintools-whois-results.csv')
page_info = pd.read_csv('../data/pages-info.csv')

## SITES

In [3]:
sites['site'] = sites['site'].str.lower()
sites.head()

Unnamed: 0,site,political_category,fb_id,unavailable_id,macedonian
0,100percentfedup.com,right,311000000000000.0,,0
1,21stcenturywire.com,left,182000000000000.0,,0
2,24dailynew.com,right,516000000000000.0,,1
3,24usnews.com,right,1430000000000000.0,,1
4,4threvolutionarywar.wordpress.com,left,,,0


In [4]:
np.shape(sites)

(677, 5)

In [5]:
miss_val = (sites.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
unavailable_id,608
fb_id,118


Dropping unavailable_id as it has a lot of missing values. 
fb_id is the same as page_id in the page_info dataframe, so we will remove this before we combine the dataframes later. 

In [6]:
sites = sites.drop(['unavailable_id', 'fb_id'], axis = 1)

In [7]:
sites.head()

Unnamed: 0,site,political_category,macedonian
0,100percentfedup.com,right,0
1,21stcenturywire.com,left,0
2,24dailynew.com,right,1
3,24usnews.com,right,1
4,4threvolutionarywar.wordpress.com,left,0


In [8]:
sites.to_csv('sites.csv')

## DOMAIN

In [9]:
domain['domain'] = domain['domain'].str.lower()
domain.head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact fax,admin contact email 1,admin contact email 2,admin contact email 3,billing contact name,billing contact org,billing contact street,billing contact city,billing contact state,billing contact postal,billing contact country,billing contact phone,billing contact fax,billing contact email 1,billing contact email 2,billing contact email 3,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact fax,registrant contact email 1,registrant contact email 2,registrant contact email 3,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact fax,technical contact email 1,technical contact email 2,technical contact email 3,create date,expiration date,additional whois email 1,additional whois email 2,additional whois email 3,name server 1 - host,name server 2 - host,name server 3 - host,name server 4 - host,name server 5 - host,name server 6 - host,name server 7 - host,name server 8 - host,registrar,registrar status 1,registrar status 2,registrar status 3,registrar status 4,registrar status 5,registrar status 6
0,100percentfedup.com,https://whois.domaintools.com/100percentfedup.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,14806240000.0,100percentfedup.com@domainsbyproxy.com,,,,,,,,,,,,,,,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,14806240000.0,100percentfedup.com@domainsbyproxy.com,,,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,14806240000.0,100percentfedup.com@domainsbyproxy.com,,,13/03/2012,29/09/2020,abuse@godaddy.com,,,ed.ns.cloudflare.com,gina.ns.cloudflare.com,,,,,,,GO DADDY SOFTWARE INC,clientTransferProhibited,,,,,
1,21stcenturywire.com,https://whois.domaintools.com/21stcenturywire.com,P. Henningsen,,"Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,,pj.henningsen@gmail.com,,,,,,,,,,,,,,,P. Henningsen,,"Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,,pj.henningsen@gmail.com,,,P. Henningsen,,"Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,,pj.henningsen@gmail.com,,,03/11/2009,03/11/2017,abuse@godaddy.com,,,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,,,,,,,GO DADDY SOFTWARE INC,clientTransferProhibited,,,,,
2,24dailynew.com,https://whois.domaintools.com/24dailynew.com,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,,contact@privacyprotect.org,,,,,,,,,,,,,,,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,,contact@privacyprotect.org,,,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,,contact@privacyprotect.org,,,21/02/2017,21/02/2018,abuse-contact@publicdomainregistry.com,,,ns201.ehosts.com,ns202.ehosts.com,,,,,,,PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM,clientTransferProhibited,,,,,
3,24usnews.com,https://whois.domaintools.com/24usnews.com,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,,24usnewss@gmail.com,,,,,,,,,,,,,,,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,,24usnewss@gmail.com,,,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,,24usnewss@gmail.com,,,03/07/2016,03/07/2018,domainabuse@tucows.com,tucows@domains.siteground.com,,ns1.renewyourname.net,ns2.renewyourname.net,,,,,,,TUCOWS DOMAINS INC,clientTransferProhibited,,,,,
4,63red.com,https://whois.domaintools.com/63red.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,14259740000.0,tcwrnqtdq@whoisprivacyprotect.com,,,,,,,,,,,,,,,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,14259740000.0,tcwrnqtdq@whoisprivacyprotect.com,,,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,14259740000.0,tcwrnqtdq@whoisprivacyprotect.com,,,05/12/2011,05/12/2017,abuse@enom.com,,,ns1.digitalocean.com,ns2.digitalocean.com,ns3.digitalocean.com,,,,,,"ENOM, INC",clientTransferProhibited,,,,,


In [10]:
np.shape(domain)

(663, 70)

In [11]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
registrar status 6,663
admin contact email 3,663
registrar status 5,663
billing contact email 3,663
billing contact email 2,663


In this dataframe we have a lot of missing values. To begin with we can remove the columns which have more than 80% missing values

In [12]:
threshold = 20.0 
min_count =  int(((100-threshold)/100)*domain.shape[0] + 1)
domain = domain.dropna(axis=1, thresh=min_count) #Dropping columns which have a nan pervcentage higher than 80%. 

Removing the rows which have more than 50% null values. 

In [13]:
threshold = 50.0 
min_count =  int(((100-threshold)/100)*domain.shape[1] + 1)
domain = domain.dropna(axis=0, thresh=min_count) #Dropping columns which have a nan pervcentage higher than 80%. 

Now that we have removed the columns with the most NaN values, we will have a look if there are any columns we do not need. 

In [14]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
registrant contact org,109
admin contact org,108
technical contact org,98
additional whois email 1,92
registrant contact state,22


The three next columns which have the most nan values are admin contact org, registrant contact org and technical contact org. We'll check the most common values in these columns. 

In [15]:
domain['admin contact org'].value_counts(dropna = False, ascending = False).head()

Domains By Proxy, LLC         199
NaN                           108
WHOISGUARD, INC                27
PROXY PROTECTION LLC           21
Discount Book Distributors     13
Name: admin contact org, dtype: int64

In [16]:
domain['registrant contact org'].value_counts(dropna = False, ascending = False).head()

Domains By Proxy, LLC    199
NaN                      109
WHOISGUARD, INC           27
PROXY PROTECTION LLC      21
WhoisGuard, Inc           13
Name: registrant contact org, dtype: int64

In [17]:
domain['technical contact org'].value_counts(dropna = False, ascending = False).head()

Domains By Proxy, LLC    199
NaN                       98
WHOISGUARD, INC           27
PROXY PROTECTION LLC      21
1&1 Internet Inc          16
Name: technical contact org, dtype: int64

Domains By Proxy, LLC is the most common for all columns, so we will fill the nan values with this. 

In [18]:
domain['admin contact org'].fillna('Domains By Proxy, LLC', inplace = True)

In [19]:
domain['registrant contact org'].fillna('Domains By Proxy, LLC', inplace = True)

In [20]:
domain['technical contact org'].fillna('Domains By Proxy, LLC', inplace = True)

In [21]:
domain['additional whois email 1'].value_counts(dropna = False, ascending = False).head()

abuse@godaddy.com             286
NaN                            92
abuse@enom.com                 61
abuse@web.com                  34
domain-abuse@dreamhost.com     21
Name: additional whois email 1, dtype: int64

In [22]:
domain[domain['additional whois email 1'] == 'abuse@godaddy.com'].head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,additional whois email 1,name server 1 - host,name server 2 - host,registrar,registrar status 1
0,100percentfedup.com,https://whois.domaintools.com/100percentfedup.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,13/03/2012,29/09/2020,abuse@godaddy.com,ed.ns.cloudflare.com,gina.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
1,21stcenturywire.com,https://whois.domaintools.com/21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,abuse@godaddy.com,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
6,abeldanger.net,https://whois.domaintools.com/abeldanger.net,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,abeldanger.net@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,abeldanger.net@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,abeldanger.net@domainsbyproxy.com,12/05/2010,12/05/2018,abuse@godaddy.com,ns21.domaincontrol.com,ns22.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited
10,activistpost.com,https://whois.domaintools.com/activistpost.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,activistpost.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,activistpost.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,activistpost.com@domainsbyproxy.com,23/04/2010,23/04/2018,abuse@godaddy.com,ns57.domaincontrol.com,ns58.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited
18,alexanderhiggins.com,https://whois.domaintools.com/alexanderhiggins...,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,alexanderhiggins.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,alexanderhiggins.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,alexanderhiggins.com@domainsbyproxy.com,11/03/2006,11/03/2023,abuse@godaddy.com,ns15.domaincontrol.com,ns16.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited


In [23]:
domain[domain['additional whois email 1'] == 'abuse@enom.com'].head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,additional whois email 1,name server 1 - host,name server 2 - host,registrar,registrar status 1
4,63red.com,https://whois.domaintools.com/63red.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,05/12/2011,05/12/2017,abuse@enom.com,ns1.digitalocean.com,ns2.digitalocean.com,"ENOM, INC",clientTransferProhibited
7,abncnews.com,https://whois.domaintools.com/abncnews.com,WHOISGUARD PROTECTED,"WHOISGUARD, INC",P.O. BOX 0823-03411,PANAMA,PANAMA,0,pa,5078366000.0,0f5b9336dfad49918cebaa3c8a809a70.protect@whois...,WHOISGUARD PROTECTED,"WHOISGUARD, INC",P.O. BOX 0823-03411,PANAMA,PANAMA,0,pa,5078366000.0,0f5b9336dfad49918cebaa3c8a809a70.protect@whois...,WHOISGUARD PROTECTED,"WHOISGUARD, INC",P.O. BOX 0823-03411,PANAMA,PANAMA,0,pa,5078366000.0,0f5b9336dfad49918cebaa3c8a809a70.protect@whois...,03/11/2015,03/11/2017,abuse@enom.com,dawn.ns.cloudflare.com,todd.ns.cloudflare.com,"ENOM, INC",clientTransferProhibited
9,act.tv,https://whois.domaintools.com/act.tv,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,31/07/2003,31/07/2017,abuse@enom.com,barbara.ns.cloudflare.com,brad.ns.cloudflare.com,"ENOM, INC",clientTransferProhibited
13,afr.net,https://whois.domaintools.com/afr.net,BUDDY SMITH,"AMERICAN FAMILY ASSOCIATION, INC","107 PARKGATE DRIVE,P.O. DRAWER 2440",TUPELO,MS,38803,us,16628450000.0,billing@optimusmedia.com,BUDDY SMITH,"AMERICAN FAMILY ASSOCIATION, INC","107 PARKGATE DRIVE,P.O. DRAWER 2440",TUPELO,MS,38803,us,16628450000.0,billing@optimusmedia.com,BUDDY SMITH,"AMERICAN FAMILY ASSOCIATION, INC","107 PARKGATE DRIVE,P.O. DRAWER 2440",TUPELO,MS,38803,us,16628450000.0,billing@optimusmedia.com,04/03/1998,03/03/2018,abuse@enom.com,dns1.name-services.com,dns2.name-services.com,"ENOM, INC",clientTransferProhibited
34,americannewsx.com,https://whois.domaintools.com/americannewsx.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O AMERICANNEWSX.COM",KIRKLAND,WA,98083,us,14252740000.0,qyysnttm@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O AMERICANNEWSX.COM",KIRKLAND,WA,98083,us,14252740000.0,qyysnttm@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O AMERICANNEWSX.COM",KIRKLAND,WA,98083,us,14252740000.0,qyysnttm@whoisprivacyprotect.com,10/09/2014,10/09/2017,abuse@enom.com,duke.ns.cloudflare.com,mia.ns.cloudflare.com,"ENOM, INC",clientTransferProhibited


In [24]:
domain[domain['additional whois email 1'] == 'abuse@web.com'].head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,additional whois email 1,name server 1 - host,name server 2 - host,registrar,registrar status 1
47,amren.com,https://whois.domaintools.com/amren.com,"Taylor, Samuel",New Century Foundation,PO Box 527,Oakton,VA,22124,us,17037160000.0,contactus@amren.com,New Century Foundation,New Century Foundation,PO BOX 527,OAKTON,VA,22124-0527,us,17037160000.0,contactus@amren.com,"Taylor, Samuel",New Century Foundation,PO Box 527,Oakton,VA,22124,us,17037160000.0,contactus@amren.com,11/08/1995,10/08/2021,abuse@web.com,ernest.ns.cloudflare.com,pola.ns.cloudflare.com,"NETWORK SOLUTIONS, LLC",clientTransferProhibited
53,anncoulter.com,https://whois.domaintools.com/anncoulter.com,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,xq5rq2fu4za@networksolutionsprivateregistratio...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,xq5rq2fu4za@networksolutionsprivateregistratio...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,xq5rq2fu4za@networksolutionsprivateregistratio...,27/01/1999,27/01/2023,abuse@web.com,ns-1461.awsdns-54.org,ns-157.awsdns-19.com,"NETWORK SOLUTIONS, LLC",clientTransferProhibited
60,bearingarms.com,https://whois.domaintools.com/bearingarms.com,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,rx3643rr8m8@networksolutionsprivateregistratio...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,rx3643rr8m8@networksolutionsprivateregistratio...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Parkway West,Jacksonville,FL,32258,us,15707090000.0,rx3643rr8m8@networksolutionsprivateregistratio...,18/04/2008,18/04/2018,abuse@web.com,jack.ns.cloudflare.com,pat.ns.cloudflare.com,"NETWORK SOLUTIONS, LLC",clientTransferProhibited
71,billmoyers.com,https://whois.domaintools.com/billmoyers.com,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Pkwy West,Jacksonville,FL,32258,us,19027490000.0,6734e88e0a28fd0a496fc346bd693879@domaindiscree...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Pkwy West,Jacksonville,FL,32258,us,19027490000.0,6734e8960a28fd0a57528c293b95bf75@domaindiscree...,"PERFECT PRIVACY, LLC","Domains By Proxy, LLC",12808 Gran Bay Pkwy West,Jacksonville,FL,32258,us,19027490000.0,6734e8970a28fd0a6c7004882e5bc77a@domaindiscree...,03/07/1997,02/07/2020,abuse@web.com,dns118.a.register.com,dns159.b.register.com,"REGISTER.COM, INC",ok
121,conservativepressnetwork.com,https://whois.domaintools.com/conservativepres...,"New Ventures Services, Corp","New Ventures Services, Corp",PO BOX 459 Drums,Drums,PA,18222,us,18558970000.0,admin@newvcorp.com,"New Ventures Services, Corp","New Ventures Services, Corp",PO BOX 459 Drums,Drums,PA,18222,us,18558970000.0,admin@newvcorp.com,"New Ventures Services, Corp","New Ventures Services, Corp",PO BOX 459 Drums,Drums,PA,18222,us,18558970000.0,admin@newvcorp.com,17/03/2017,17/03/2018,abuse@web.com,ns1626.ztomy.com,ns2626.ztomy.com,NAMEPAL.COM #8001,ok


In [25]:
domain[domain['additional whois email 1'] == 'domain-abuse@dreamhost.com'].head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,additional whois email 1,name server 1 - host,name server 2 - host,registrar,registrar status 1
89,borntoberight.com,https://whois.domaintools.com/borntoberight.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O BORNTOBERIGHT.COM",BREA,CA,92821,us,17147060000.0,cjhmxwgd6xybatj@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O BORNTOBERIGHT.COM",BREA,CA,92821,us,17147060000.0,cjhmxwgd6xybatj@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O BORNTOBERIGHT.COM",BREA,CA,92821,us,17147060000.0,cjhmxwgd6xybatj@proxy.dreamhost.com,05/04/2017,05/04/2018,domain-abuse@dreamhost.com,ns1.dreamhost.com,ns2.dreamhost.com,"DREAMHOST, LLC",clientTransferProhibited
94,chicksontheright.com,https://whois.domaintools.com/chicksontheright...,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CHICKSONTHERIGHT.COM",BREA,CA,92821,us,17147060000.0,chicksontheright.com@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CHICKSONTHERIGHT.COM",BREA,CA,92821,us,17147060000.0,chicksontheright.com@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CHICKSONTHERIGHT.COM",BREA,CA,92821,us,17147060000.0,chicksontheright.com@proxy.dreamhost.com,18/02/2009,18/02/2018,domain-abuse@dreamhost.com,chan.ns.cloudflare.com,vern.ns.cloudflare.com,"DREAMHOST, LLC",clientTransferProhibited
105,consamerica.com,https://whois.domaintools.com/consamerica.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSAMERICA.COM",BREA,CA,92821,us,17147060000.0,3myn73b7ctgestn@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSAMERICA.COM",BREA,CA,92821,us,17147060000.0,3myn73b7ctgestn@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSAMERICA.COM",BREA,CA,92821,us,17147060000.0,3myn73b7ctgestn@proxy.dreamhost.com,10/07/2016,10/07/2018,domain-abuse@dreamhost.com,aragorn.ns.cloudflare.com,mira.ns.cloudflare.com,"DREAMHOST, LLC",ok
125,conservativeshere.com,https://whois.domaintools.com/conservativesher...,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSERVATIVESHERE.COM",BREA,CA,92821,us,17147060000.0,tg2bkzzvdrkn3ul@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSERVATIVESHERE.COM",BREA,CA,92821,us,17147060000.0,tg2bkzzvdrkn3ul@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O CONSERVATIVESHERE.COM",BREA,CA,92821,us,17147060000.0,tg2bkzzvdrkn3ul@proxy.dreamhost.com,16/03/2017,16/03/2018,domain-abuse@dreamhost.com,ns1.dreamhost.com,ns2.dreamhost.com,"DREAMHOST, LLC",clientTransferProhibited
198,enhlive.com,https://whois.domaintools.com/enhlive.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O ENHLIVE.COM",BREA,CA,92821,us,17147060000.0,ul5ftpgj77cpn9g@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O ENHLIVE.COM",BREA,CA,92821,us,17147060000.0,ul5ftpgj77cpn9g@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O ENHLIVE.COM",BREA,CA,92821,us,17147060000.0,ul5ftpgj77cpn9g@proxy.dreamhost.com,20/11/2016,20/11/2017,domain-abuse@dreamhost.com,ns1.dreamhost.com,ns2.dreamhost.com,"DREAMHOST, LLC",clientTransferProhibited


For additional whois email 1, we can see that it has a correlation with the column registrar when checking each value for the email. The additional whois email 1 will not give any more information than registrar, so we can remove this. 

In [26]:
domain = domain.drop(columns = 'additional whois email 1')

In [27]:
domain['registrant contact state'].value_counts(dropna = False, ascending = False).head()

Arizona    202
CA          43
PANAMA      27
FL          25
NaN         22
Name: registrant contact state, dtype: int64

We can see Arizona is the most common, but let's have a look at the rows where 'registrant contact state' is NaN.

In [28]:
domain[domain['registrant contact state'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
1,21stcenturywire.com,https://whois.domaintools.com/21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
56,arcmedia.org,https://whois.domaintools.com/arcmedia.org,Master Host,One.com,Kalvebod Brygge 24,Copenhagen V,Copenhagen V,1560,dk,4546907000.0,hostmaster@one.com,John Stutsman,"Domains By Proxy, LLC",8280 High Point Drive,Evansville,,47630,us,18124900000.0,john@arcmedia.org,Master Host,One.com,Kalvebod Brygge 24,Copenhagen V,Copenhagen V,1560,dk,4546907000.0,hostmaster@one.com,23/01/2017,23/01/2018,duke.ns.cloudflare.com,sue.ns.cloudflare.com,"ASCIO TECHNOLOGIES, INC. DANMARK ? FILIAL AF A...",ok
66,bigbluedimension.com,https://whois.domaintools.com/bigbluedimension...,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,17/05/2016,17/05/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited
67,bigbluevision.org,https://whois.domaintools.com/bigbluevision.org,Tatjana Karakachanova,"Domains By Proxy, LLC",BELGRADSKA 27,Veles,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,Tatjana Karakachanova,"Domains By Proxy, LLC",BELGRADSKA 27,Veles,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,Bluehost Inc,Bluehost.com,1958 South 950 East,Provo,Utah,84606,us,18017660000.0,whois@bluehost.com,25/07/2016,25/07/2017,jasmine.ns.cloudflare.com,ken.ns.cloudflare.com,"FASTDOMAIN, INC",clientTransferProhibited
78,blingnews.com,https://whois.domaintools.com/blingnews.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,13/08/2016,13/08/2017,ns3.tmads.com,ns4.tmads.com,GO DADDY SOFTWARE INC,clientTransferProhibited
93,c4ss.org,https://whois.domaintools.com/c4ss.org,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,06/04/2005,06/04/2019,a.dns.gandi.net,b.dns.gandi.net,GANDI,clientTransferProhibited
137,cosmo-politics.com,https://whois.domaintools.com/cosmo-politics.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,04/02/2016,04/02/2018,dns1.namecheaphosting.com,dns2.namecheaphosting.com,GO DADDY SOFTWARE INC,clientTransferProhibited
155,dailynews24-7.com,https://whois.domaintools.com/dailynews24-7.com,ANGEL JAKIMOVSKI,"Domains By Proxy, LLC",SRBO TOMOVIC 36 1/13,KUMANOVO,,1300,mk,38975450000.0,angel_jakimovski@yahoo.com,ANGEL JAKIMOVSKI,"Domains By Proxy, LLC",SRBO TOMOVIC 36 1/13,KUMANOVO,,1300,mk,38975450000.0,angel_jakimovski@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,23/03/2016,23/03/2017,ns1.bluehost-expired.domainparkingserver.net,ns2.bluehost-expired.domainparkingserver.net,"FASTDOMAIN, INC",clientTransferProhibited
162,dailysidnews.com,https://whois.domaintools.com/dailysidnews.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,26/01/2017,10/03/2018,ns45.domaincontrol.com,ns46.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited
181,departed.co,https://whois.domaintools.com/departed.co,Bernard Smith,"Domains By Proxy, LLC",Address,City,,119,ge,9951231000.0,apatche777@yahoo.com,Bernanrd Smith,"Domains By Proxy, LLC",Street,City,,119,ge,9951231000.0,apatche777@yahoo.com,Hostmaster ONEANDONE,1&1 Internet Inc,"701 Lee Rd.,Suite 300",Chesterbrook,PA,19087,us,18774610000.0,hostmaster@1and1.com,22/02/2016,21/02/2018,ns-us.1and1-dns.com,ns-us.1and1-dns.de,1&1 INTERNET SE,clientTransferProhibited


The reason why 'registrant contact state' is NaN in some rows, is because there is no states in the country for those rows. 
Since there is no state here, we will just fill it with 0. 

In [29]:
domain['registrant contact state'].fillna(0, inplace = True)

For admin contact state, it is probably the same reason as with 'registrant contact state' why we have nan values:

In [30]:
domain[domain['admin contact state'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
1,21stcenturywire.com,https://whois.domaintools.com/21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
66,bigbluedimension.com,https://whois.domaintools.com/bigbluedimension...,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,17/05/2016,17/05/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited
67,bigbluevision.org,https://whois.domaintools.com/bigbluevision.org,Tatjana Karakachanova,"Domains By Proxy, LLC",BELGRADSKA 27,Veles,,1400,mk,389078000000.0,stojanov_tose@yahoo.com,Tatjana Karakachanova,"Domains By Proxy, LLC",BELGRADSKA 27,Veles,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,Bluehost Inc,Bluehost.com,1958 South 950 East,Provo,Utah,84606,us,18017660000.0,whois@bluehost.com,25/07/2016,25/07/2017,jasmine.ns.cloudflare.com,ken.ns.cloudflare.com,"FASTDOMAIN, INC",clientTransferProhibited
78,blingnews.com,https://whois.domaintools.com/blingnews.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,0,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,13/08/2016,13/08/2017,ns3.tmads.com,ns4.tmads.com,GO DADDY SOFTWARE INC,clientTransferProhibited
93,c4ss.org,https://whois.domaintools.com/c4ss.org,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,0,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,06/04/2005,06/04/2019,a.dns.gandi.net,b.dns.gandi.net,GANDI,clientTransferProhibited
137,cosmo-politics.com,https://whois.domaintools.com/cosmo-politics.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,0,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,04/02/2016,04/02/2018,dns1.namecheaphosting.com,dns2.namecheaphosting.com,GO DADDY SOFTWARE INC,clientTransferProhibited
155,dailynews24-7.com,https://whois.domaintools.com/dailynews24-7.com,ANGEL JAKIMOVSKI,"Domains By Proxy, LLC",SRBO TOMOVIC 36 1/13,KUMANOVO,,1300,mk,38975450000.0,angel_jakimovski@yahoo.com,ANGEL JAKIMOVSKI,"Domains By Proxy, LLC",SRBO TOMOVIC 36 1/13,KUMANOVO,0,1300,mk,38975450000.0,angel_jakimovski@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,23/03/2016,23/03/2017,ns1.bluehost-expired.domainparkingserver.net,ns2.bluehost-expired.domainparkingserver.net,"FASTDOMAIN, INC",clientTransferProhibited
162,dailysidnews.com,https://whois.domaintools.com/dailysidnews.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,0,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,26/01/2017,10/03/2018,ns45.domaincontrol.com,ns46.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited
181,departed.co,https://whois.domaintools.com/departed.co,Bernard Smith,"Domains By Proxy, LLC",Address,City,,119,ge,9951231000.0,apatche777@yahoo.com,Bernanrd Smith,"Domains By Proxy, LLC",Street,City,0,119,ge,9951231000.0,apatche777@yahoo.com,Hostmaster ONEANDONE,1&1 Internet Inc,"701 Lee Rd.,Suite 300",Chesterbrook,PA,19087,us,18774610000.0,hostmaster@1and1.com,22/02/2016,21/02/2018,ns-us.1and1-dns.com,ns-us.1and1-dns.de,1&1 INTERNET SE,clientTransferProhibited
231,globalinfotoday.com,https://whois.domaintools.com/globalinfotoday.com,Dajana Petrovska,"Domains By Proxy, LLC",Blagoj Gjorev 89/2-1,Veles,,1400,mk,38976380000.0,viktortonevski12@gmail.com,Dajana Petrovska,"Domains By Proxy, LLC",Blagoj Gjorev 89/2-1,Veles,0,1400,mk,38976380000.0,viktortonevski12@gmail.com,Hostmaster ONEANDONE,1&1 Internet Inc,"701 Lee Rd.,Suite 300",Chesterbrook,PA,19087,us,18774610000.0,hostmaster@1and1.com,26/09/2016,26/09/2017,ns1.usasportsfields.com,ns2.usasportsfields.com,1&1 INTERNET SE,clientTransferProhibited


For this column, we will fill the nan values with 0. 

In [31]:
domain['admin contact state'].fillna(0, inplace = True)

Same with technical contact state:

In [32]:
domain[domain['technical contact state'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
1,21stcenturywire.com,https://whois.domaintools.com/21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
78,blingnews.com,https://whois.domaintools.com/blingnews.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,0,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,0,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,Todor Spasev,"Domains By Proxy, LLC",Dimitar Vlahov 64,Veles,,1400,mk,38978280000.0,healthyhappy247hh@gmail.com,13/08/2016,13/08/2017,ns3.tmads.com,ns4.tmads.com,GO DADDY SOFTWARE INC,clientTransferProhibited
93,c4ss.org,https://whois.domaintools.com/c4ss.org,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,0,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,0,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,William Gillis,Center for a Stateless Society,"The Center for a Stateless Society,P.O. Box 47...",Tulsa,,74133,us,15033880000.0,263d090978333e1b915e69b23537a175-3375344@conta...,06/04/2005,06/04/2019,a.dns.gandi.net,b.dns.gandi.net,GANDI,clientTransferProhibited
137,cosmo-politics.com,https://whois.domaintools.com/cosmo-politics.com,Gjorgi Janev,na,na,na,0,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,0,na,mk,38978430000.0,gjorgji.janev1@gmail.com,Gjorgi Janev,na,na,na,,na,mk,38978430000.0,gjorgji.janev1@gmail.com,04/02/2016,04/02/2018,dns1.namecheaphosting.com,dns2.namecheaphosting.com,GO DADDY SOFTWARE INC,clientTransferProhibited
162,dailysidnews.com,https://whois.domaintools.com/dailysidnews.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,0,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,0,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,Filip Ivanov,"Domains By Proxy, LLC",Hristo Tatarchev 16/2-4,Veles,,1400,mk,38978270000.0,ivantrajchev96@hotmail.com,26/01/2017,10/03/2018,ns45.domaincontrol.com,ns46.domaincontrol.com,GO DADDY SOFTWARE INC,clientTransferProhibited
276,knoxreport.com,https://whois.domaintools.com/knoxreport.com,Nikki Venter,Nikki Venter,"Private Bag X1, Suite 276 Melkbosstrand",Cape Town,0,7437,za,27725850000.0,niks.venter@gmail.com,Nikki Venter,Nikki Venter,"Private Bag X1, Suite 276 Melkbosstrand",Cape Town,0,7437,za,27725850000.0,niks.venter@gmail.com,Domain Administrator,Hetzner (Pty) Ltd,P.O. Box 3450,Durbanville,,7551,za,27219700000.0,info@hetzner.co.za,04/11/2015,04/11/2017,ns1.dns-h.com,ns1.host-h.net,TUCOWS DOMAINS INC,clientTransferProhibited
348,newsflashonline.info,https://whois.domaintools.com/newsflashonline....,Aleksandar Ristevski,GatorTips,Franc Rozman 107,Kumanovo,0,1300,mk,38978440000.0,aleksandarcako9@gmail.com,Aleksandar Ristevski,GatorTips,Franc Rozman 107,Kumanovo,0,1300,mk,38978440000.0,aleksandarcako9@gmail.com,Aleksandar Ristevski,GatorTips,Franc Rozman 107,Kumanovo,,1300,mk,38978440000.0,aleksandarcako9@gmail.com,08/02/2017,08/02/2018,ns1.focusnews.us,ns2.focusnews.us,,clientDeleteProhibited
527,thenewamerican.com,https://whois.domaintools.com/thenewamerican.com,Brian Witt,"Domains By Proxy, LLC",The John Birch Society PO Box 8040 770 Westhil...,,0,,us,9207494000.0,webmaster@jbs.org,,PO Box 8040,770 Westhill Blvd,Appleton,Wisconsin,54912,us,,webmaster@jbs.org,Brian Witt,"Domains By Proxy, LLC",The John Birch Society Post Office Box 8040 77...,,,,us,9207494000.0,webmaster@jbs.org,15/04/1999,15/04/2018,dns4.rr.com,ns1.biz.rr.com,GO DADDY SOFTWARE INC,clientTransferProhibited
567,trueactivist.com,https://whois.domaintools.com/trueactivist.com,On behalf of trueactivist.com,c/o IDPS International Domain Privacy Services...,Hansaallee 191,Duesseldorf,0,40549,de,4921190000000.0,jkbm728uzzx8fgscrk3nr5mna9wx4vff@proxy-privacy...,On behalf of trueactivist.com,c/o IDPS International Domain Privacy Services...,Hansaallee 191,Duesseldorf,0,40549,de,4921190000000.0,60pmzbo7hh7ev0s60h0j2tqxnk9mv413@proxy-privacy...,On behalf of trueactivist.com,c/o IDPS International Domain Privacy Services...,Hansaallee 191,Duesseldorf,,40549,de,4921190000000.0,jkbm728uzzx8fgscrk3nr5mna9wx4vff@proxy-privacy...,07/09/2011,07/09/2018,bella.ns.cloudflare.com,dom.ns.cloudflare.com,JOKER.COM,clientTransferProhibited


Also fill this with 0. 

In [33]:
domain['technical contact state'].fillna(0, inplace = True)

Checking missing values again:

In [34]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
technical contact postal,14
admin contact postal,14
registrant contact postal,13
registrant contact phone,11
technical contact phone,9


For the 'technical contact postal', 'admin contact postal' and 'registrant contact postal', we fill the nan values with 0.	

In [35]:
domain['technical contact postal'].fillna(0, inplace = True)

In [36]:
domain['admin contact postal'].fillna(0, inplace = True)

In [37]:
domain['registrant contact postal'].fillna(0, inplace = True)

In [38]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val

Unnamed: 0,MissingValueCount
registrant contact phone,11
technical contact phone,9
admin contact phone,8
registrar,5
registrar status 1,3
name server 2 - host,3
registrant contact name,3
admin contact city,1
technical contact name,1
technical contact city,1


The contact phone for the registrant, technical and admin we can fill the nan values with 0. 

In [40]:
domain['technical contact phone'].fillna(0, inplace = True)

In [41]:
domain['registrant contact phone'].fillna(0, inplace = True)

In [42]:
domain['admin contact phone'].fillna(0, inplace = True)

In [43]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val

Unnamed: 0,MissingValueCount
registrar,5
registrar status 1,3
name server 2 - host,3
registrant contact name,3
technical contact city,1
technical contact name,1
admin contact city,1


For the registrar column, we fill the nan values with the most common value:

In [44]:
domain['registrar'].value_counts().head()

GO DADDY SOFTWARE INC     321
ENOM, INC                  65
NETWORK SOLUTIONS, LLC     38
FASTDOMAIN, INC            23
DREAMHOST, LLC             21
Name: registrar, dtype: int64

In [45]:
domain['registrar'].fillna('GO DADDY SOFTWARE INC', inplace = True)

Next on the missing value list:

In [46]:
domain['registrar status 1'].value_counts()

clientTransferProhibited                                                                        542
ok                                                                                               47
clientDeleteProhibited                                                                           43
clientdeleteprohibited clientrenewprohibited clienttransferprohibited clientupdateprohibited      7
clientUpdateProhibited                                                                            5
serverTransferProhibited                                                                          2
clienttransferprohibited clientupdateprohibited                                                   1
ok http://www.icann.org/epp#ok                                                                    1
Name: registrar status 1, dtype: int64

We will fill this with the most common

In [47]:
domain['registrar status 1'].fillna('clientTransferProhibited', inplace = True)

name server 2 - host:

In [48]:
domain[domain['name server 2 - host'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
66,bigbluedimension.com,https://whois.domaintools.com/bigbluedimension...,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,17/05/2016,17/05/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited
418,proud-patriots.com,https://whois.domaintools.com/proud-patriots.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,02/07/2016,02/07/2018,blank-nameserver.com,,"DREAMHOST, LLC",clientTransferProhibited
620,uspoliticsinsider.com,https://whois.domaintools.com/uspoliticsinside...,JORDAN DAVCHEV,"Domains By Proxy, LLC",ANDON SURKOV 8/17,VELES,0,1400,mk,38977910000.0,uspoliticsinsider@gmail.com,JORDAN DAVCHEV,"Domains By Proxy, LLC",ANDON SURKOV 8/17,VELES,0,1400,mk,38977910000.0,uspoliticsinsider@gmail.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,04/06/2016,04/06/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited


In [49]:
domain[domain['name server 1 - host'] == 'blank-nameserver.com'].head()

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
66,bigbluedimension.com,https://whois.domaintools.com/bigbluedimension...,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,TATJANA KARAKACHANOVA,"Domains By Proxy, LLC",BELGRADSKA 27,VELES,0,1400,mk,389078000000.0,stojanov_tose@yahoo.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,17/05/2016,17/05/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited
418,proud-patriots.com,https://whois.domaintools.com/proud-patriots.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,PROXY PROTECTION LLC,PROXY PROTECTION LLC,"417 ASSOCIATED RD #324,C/O PROUD-PATRIOTS.COM",BREA,CA,92821,us,17147060000.0,amh23trrj7ut9v8@proxy.dreamhost.com,02/07/2016,02/07/2018,blank-nameserver.com,,"DREAMHOST, LLC",clientTransferProhibited
620,uspoliticsinsider.com,https://whois.domaintools.com/uspoliticsinside...,JORDAN DAVCHEV,"Domains By Proxy, LLC",ANDON SURKOV 8/17,VELES,0,1400,mk,38977910000.0,uspoliticsinsider@gmail.com,JORDAN DAVCHEV,"Domains By Proxy, LLC",ANDON SURKOV 8/17,VELES,0,1400,mk,38977910000.0,uspoliticsinsider@gmail.com,BLUEHOST INC,BLUEHOST.COM,550 E TIMPANOGOS PKWY,OREM,UTAH,84097,us,18017660000.0,whois@bluehost.com,04/06/2016,04/06/2018,blank-nameserver.com,,"FASTDOMAIN, INC",clientTransferProhibited


In [50]:
domain['name server 2 - host'].fillna('blank-nameserver.com', inplace = True)

In [51]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val

Unnamed: 0,MissingValueCount
registrant contact name,3
technical contact name,1
admin contact city,1
technical contact city,1


In [52]:
domain[domain['registrant contact name'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
428,realclearpolitics.com,https://whois.domaintools.com/realclearpolitic...,contactprivacy.com,"Domains By Proxy, LLC",96 Mowat Ave,Toronto,Ontario,M6K 3M1,ca,14165390000.0,realclearpolitics.com@contactprivacy.com,,Contactprivacy.com,96 Mowat Ave,Toronto,Ontario,M6K 3M1,ca,14165390000.0,realclearpolitics.com@contactprivacy.com,contactprivacy.com,"Domains By Proxy, LLC",96 Mowat Ave,Toronto,Ontario,M6K 3M1,ca,14165390000.0,realclearpolitics.com@contactprivacy.com,03/02/2000,03/02/2018,ns10.dnsmadeeasy.com,ns11.dnsmadeeasy.com,GO DADDY SOFTWARE INC,clientTransferProhibited
527,thenewamerican.com,https://whois.domaintools.com/thenewamerican.com,Brian Witt,"Domains By Proxy, LLC",The John Birch Society PO Box 8040 770 Westhil...,,0,0,us,9207494000.0,webmaster@jbs.org,,PO Box 8040,770 Westhill Blvd,Appleton,Wisconsin,54912,us,0.0,webmaster@jbs.org,Brian Witt,"Domains By Proxy, LLC",The John Birch Society Post Office Box 8040 77...,,0,0,us,9207494000.0,webmaster@jbs.org,15/04/1999,15/04/2018,dns4.rr.com,ns1.biz.rr.com,GO DADDY SOFTWARE INC,clientTransferProhibited
613,usconservative.com,https://whois.domaintools.com/usconservative.com,Bill Loran,Gregory Management Co,620 Shelby Street,Bristol,Tennessee,37620,us,4237930000.0,loranb@jrgregory.com,,Gregory Management Co,620 Shelby Street,Bristol,Tennessee,37620,us,0.0,nomail@godaddy.com,,Gregory Management Co,620 Shelby Street,Bristol,Tennessee,37620,us,0.0,nomail@godaddy.com,22/12/2001,22/12/2018,customer.buydomains.com,customer2.buydomains.com,GO DADDY SOFTWARE INC,clientTransferProhibited


In [53]:
domain['registrant contact name'].fillna(domain['admin contact name'], inplace = True)

Same with technical contact name. 

In [54]:
domain['technical contact name'].fillna(domain['admin contact name'], inplace = True)

In [55]:
domain[domain['admin contact city'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
527,thenewamerican.com,https://whois.domaintools.com/thenewamerican.com,Brian Witt,"Domains By Proxy, LLC",The John Birch Society PO Box 8040 770 Westhil...,,0,0,us,9207494000.0,webmaster@jbs.org,Brian Witt,PO Box 8040,770 Westhill Blvd,Appleton,Wisconsin,54912,us,0.0,webmaster@jbs.org,Brian Witt,"Domains By Proxy, LLC",The John Birch Society Post Office Box 8040 77...,,0,0,us,9207494000.0,webmaster@jbs.org,15/04/1999,15/04/2018,dns4.rr.com,ns1.biz.rr.com,GO DADDY SOFTWARE INC,clientTransferProhibited


In [56]:
domain[domain['technical contact city'].isna()]

Unnamed: 0,domain,3,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
527,thenewamerican.com,https://whois.domaintools.com/thenewamerican.com,Brian Witt,"Domains By Proxy, LLC",The John Birch Society PO Box 8040 770 Westhil...,,0,0,us,9207494000.0,webmaster@jbs.org,Brian Witt,PO Box 8040,770 Westhill Blvd,Appleton,Wisconsin,54912,us,0.0,webmaster@jbs.org,Brian Witt,"Domains By Proxy, LLC",The John Birch Society Post Office Box 8040 77...,,0,0,us,9207494000.0,webmaster@jbs.org,15/04/1999,15/04/2018,dns4.rr.com,ns1.biz.rr.com,GO DADDY SOFTWARE INC,clientTransferProhibited


We only have one admin contact city and technical contact city that are missing, and it is in the same row. 
In the dataset we can see that registrant contact city, admin contact city and technical city has been the same for multiple rows, so we will fill the missing values with the corresponding registrant contact city. 

In [57]:
domain['technical contact city'].fillna(domain['registrant contact city'], inplace = True)

In [58]:
domain['admin contact city'].fillna(domain['registrant contact city'], inplace = True)

Column named 3 does not give us any  more information than the domain column so we can remove this. 

In [59]:
domain = domain.drop(['3'], axis = 1)

In [60]:
domain.head()

Unnamed: 0,domain,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1
0,100percentfedup.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,13/03/2012,29/09/2020,ed.ns.cloudflare.com,gina.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
1,21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited
2,24dailynew.com,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,21/02/2017,21/02/2018,ns201.ehosts.com,ns202.ehosts.com,PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM,clientTransferProhibited
3,24usnews.com,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,24usnewss@gmail.com,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,24usnewss@gmail.com,Aleksandar Nikolov,24usnews,Georgi Dimitrov 7,Veles,XX,1400,mk,38976250000.0,24usnewss@gmail.com,03/07/2016,03/07/2018,ns1.renewyourname.net,ns2.renewyourname.net,TUCOWS DOMAINS INC,clientTransferProhibited
4,63red.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,05/12/2011,05/12/2017,ns1.digitalocean.com,ns2.digitalocean.com,"ENOM, INC",clientTransferProhibited


Checking how many missing values we have left:

In [61]:
miss_val = (domain.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount


Everything is replaced/removed!

## PAGE_INFO

In [62]:
page_info['website'] = page_info['website'].str.lower()
page_info.head()

Unnamed: 0,page_name,about,fan_count,talking_about_count,website,page_id,year,month,day
0,Americans Against the Tea Party,We are your go to source for political news.,583256,43343,http://www.aattp.org,108038612554992,2009.0,4.0,3.0
1,act.tv,Rise up and Resist! Your home for movement-ori...,285075,481748,http://act.tv,153418591515382,,,
2,New Blue United,,1476093,93116,www.bluetribune.com,188464111175168,,,
3,Obama is the Worst President in US History,,1569590,41452,,296856040436954,2013.0,1.0,1.0
4,RedFlag NewsDesk,Daily headlines from the official RedFlagNews....,1533,76,http://redflagnews.com,492836854251934,2016.0,3.0,16.0


Removing www. and http:// from the websites

In [63]:
def remove_punc(text):
    text = str(text).replace('www.', '').replace('http://', '')
    return text

page_info['website'] = page_info['website'].apply(remove_punc)

In [65]:
miss_val = (page_info.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
day,362
month,332
year,241
about,26


Day, month and year has a lot of missing values, and we can remove these. 

In [66]:
page_info = page_info.drop(['day', 'month', 'year'], axis = 1)

Checking the missing values again:

In [67]:
miss_val = (page_info.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount
about,26


In [68]:
page_info['about'] = page_info['about'].replace(np.nan, 0)

In [69]:
page_info.head()

Unnamed: 0,page_name,about,fan_count,talking_about_count,website,page_id
0,Americans Against the Tea Party,We are your go to source for political news.,583256,43343,aattp.org,108038612554992
1,act.tv,Rise up and Resist! Your home for movement-ori...,285075,481748,act.tv,153418591515382
2,New Blue United,0,1476093,93116,bluetribune.com,188464111175168
3,Obama is the Worst President in US History,0,1569590,41452,,296856040436954
4,RedFlag NewsDesk,Daily headlines from the official RedFlagNews....,1533,76,redflagnews.com,492836854251934


In [70]:
miss_val = (page_info.isnull().sum())
miss_val = miss_val.sort_values(ascending=False)
miss_val = pd.DataFrame({'MissingValueCount' : miss_val})
miss_val = miss_val[miss_val['MissingValueCount'] > 0]
miss_val.head()

Unnamed: 0,MissingValueCount


Now we have removed and replaced all the nan values. 
We do not remove any more columns as they can give valuable information in the classifiers. 

### Combining data set

In [71]:
df = pd.merge(left=sites, right=domain, left_on='site', right_on='domain')
df_merge = pd.merge(left=df, right=page_info, left_on='site', right_on='website')

In [72]:
np.shape(df_merge)

(288, 43)

In [73]:
df_merge.to_csv('df_merge.csv')  

In [85]:
df_merge.head()

Unnamed: 0,site,political_category,macedonian,domain,admin contact name,admin contact org,admin contact street,admin contact city,admin contact state,admin contact postal,admin contact country,admin contact phone,admin contact email 1,registrant contact name,registrant contact org,registrant contact street,registrant contact city,registrant contact state,registrant contact postal,registrant contact country,registrant contact phone,registrant contact email 1,technical contact name,technical contact org,technical contact street,technical contact city,technical contact state,technical contact postal,technical contact country,technical contact phone,technical contact email 1,create date,expiration date,name server 1 - host,name server 2 - host,registrar,registrar status 1,page_name,about,fan_count,talking_about_count,website,page_id
0,100percentfedup.com,1,0,100percentfedup.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,Registration Private,"Domains By Proxy, LLC","DomainsByProxy.com,14455 N. Hayden Road",Scottsdale,Arizona,85260,us,14806240000.0,100percentfedup.com@domainsbyproxy.com,13/03/2012,29/09/2020,ed.ns.cloudflare.com,gina.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited,100 Percent FED Up,We are two moms inspired by the life of Andrew...,1579365,174924,100percentfedup.com,311190048935167
1,21stcenturywire.com,0,0,21stcenturywire.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,P. Henningsen,"Domains By Proxy, LLC","Unit 220, 8-10 Sunnyhill Road",London,0,SW16 2BJ,gb,447547000000.0,pj.henningsen@gmail.com,03/11/2009,03/11/2017,alan.ns.cloudflare.com,lisa.ns.cloudflare.com,GO DADDY SOFTWARE INC,clientTransferProhibited,21st Century Wire,"""NEWS FOR THE WAKING GENERATION""",32007,951,21stcenturywire.com,182032255155419
2,63red.com,1,0,63red.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,WHOIS AGENT,"WHOIS PRIVACY PROTECTION SERVICE, INC","PO BOX 639,C/O 63RED.COM",KIRKLAND,WA,98083,us,14252740000.0,tcwrnqtdq@whoisprivacyprotect.com,05/12/2011,05/12/2017,ns1.digitalocean.com,ns2.digitalocean.com,"ENOM, INC",clientTransferProhibited,63red,"Conservative news on iPhone, iPad and web.",50,0,63red.com,267165846826107
3,aattp.org,0,0,aattp.org,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,"Domain Admin, C/O ID#10760",Privacy Protection Service INC d/b/a PrivacyPr...,PO Box 16,Nobby Beach,Queensland,QLD 4218,au,4536947000.0,contact@privacyprotect.org,03/12/2011,03/12/2017,mario.ns.cloudflare.com,robin.ns.cloudflare.com,PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM,clientTransferProhibited,Americans Against the Tea Party,We are your go to source for political news.,583256,43343,aattp.org,108038612554992
4,act.tv,0,0,act.tv,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,BOB FERTIK,E-DEMOCRACY GROUP,BOX 1452,NEW YORK,NY,10276,us,11234570000.0,bob.fertik@gmail.com,31/07/2003,31/07/2017,barbara.ns.cloudflare.com,brad.ns.cloudflare.com,"ENOM, INC",clientTransferProhibited,act.tv,Rise up and Resist! Your home for movement-ori...,285075,481748,act.tv,153418591515382
