The data set for this project is from the UCI Machine Learning Repository.

https://archive.ics.uci.edu/ml/datasets/p53+Mutants 

# Imported the required packages

In [1]:
import pandas as pd
import numpy as np
import re
import scipy
from scipy import stats

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read in the data set as df

The data set consisted of 2 files – the data file, K9.data and the K9.instance.tags file that contains the names of the mutations. Both files were downloaded to the local hard drive. The .data file was read in using pd.read_csv and saved to a dataframe, ‘df’.

In [3]:
df = pd.read_csv('./K9.data', low_memory=False, header=None)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31420 entries, 0 to 31419
Columns: 5410 entries, 0 to 5409
dtypes: float64(1), object(5409)
memory usage: 1.3+ GB


In [5]:
df.dtypes

0        object
1        object
2        object
3        object
4        object
         ...   
5405     object
5406     object
5407     object
5408     object
5409    float64
Length: 5410, dtype: object

In [6]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5400,5401,5402,5403,5404,5405,5406,5407,5408,5409
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,inactive,
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,


There are 31,420 entries and 5410 features and all the columns are of float type except for one which is an object type.

To minimize the confusion between numbered columns and rows, the prefix ‘2D’ was attached to the first 4826 column numbers and the prefix ‘3D’ to the 4827th – 5408th column using string methods to identify the 2D and 3D features. The 5409th column was labeled as ‘Type’ as it indicated the class label. 

In [7]:
# Adding the prefix '2D' and '3D' to the column names. 0-4825 - add 2D. 4826-5407 - add 3D. 
# The column with the labels active/inactive is named 'Type'
columns_string = []
for i in df.columns:
    if int(i) <= 4825:
        i = '2D'+str(i+1)
        columns_string.append(i)
    elif int(i) >= 4826 and  i <= 5407:
        i = '3D'+str(i+1)
        columns_string.append(i)
    elif int(i) == 5408:
        i = 'Type'
        columns_string.append(i)
    else:
        i = i
        columns_string.append(i)
columns_string[-10:]

['3D5401',
 '3D5402',
 '3D5403',
 '3D5404',
 '3D5405',
 '3D5406',
 '3D5407',
 '3D5408',
 'Type',
 5409]

In [8]:
df.columns = columns_string

The tags  or the mutation information of the data set is in another file 'instance.tags'. Read in this data and add it as a feature to df.

In [9]:
tags = pd.read_csv('./K9.instance.tags', header=None)

In [10]:
# Add the tags as a column to the dataframe df.
df['mutations'] = tags

In [11]:
df.head(5)

Unnamed: 0,2D1,2D2,2D3,2D4,2D5,2D6,2D7,2D8,2D9,2D10,...,3D5402,3D5403,3D5404,3D5405,3D5406,3D5407,3D5408,Type,5409,mutations
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,,%a119e
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,,%a119e_l125p
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,inactive,,%a119e_r283k_a353v
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,,%a161t
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,,%c135y


The column with NaNs was dropped. There were also some rows with ‘?’. Since, there were no descriptions of the features available and there were 5408 of them for each mutation, it seemed best to drop all missing values after converting them to NaNs. 

In [12]:
# The column 5409 is all NaN, so dropped it.
df = df.drop([5409], axis=1)

In [13]:
df.head(5)

Unnamed: 0,2D1,2D2,2D3,2D4,2D5,2D6,2D7,2D8,2D9,2D10,...,3D5401,3D5402,3D5403,3D5404,3D5405,3D5406,3D5407,3D5408,Type,mutations
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,%a119e
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,%a119e_l125p
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,inactive,%a119e_r283k_a353v
3,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,%a161t
4,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,%c135y


In [14]:
# The dataframe has ? and NaN for some values and for full rows. Converted all ? to NaN and dropped it.
df.replace('?', np.nan, inplace = True)

In [15]:
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [16]:
df.isnull().sum().sum()

0

In [17]:
df.head()

Unnamed: 0,2D1,2D2,2D3,2D4,2D5,2D6,2D7,2D8,2D9,2D10,...,3D5401,3D5402,3D5403,3D5404,3D5405,3D5406,3D5407,3D5408,Type,mutations
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.013,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,%a119e
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,%a119e_l125p
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.01,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,%a161t
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.012,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,%c135y
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,inactive,%c135y_e285m


In [18]:
sum(df['Type']=='inactive')

31008

In [19]:
sum(df['Type']=='active')

151

In [20]:
df.shape

(31159, 5410)

The ‘mutations’ column contains the mutation information for each entry/row.

For eg:  ‘%a119e_l125p’ denotes 2 point mutations separated by ‘_’, the first one is ‘a119e’ and the second one is ‘l125p’. The ‘a119e’ is read as – amino acid ‘a’ at position 119 of the protein chain is replaced by amino acid ‘e’. Based on the number of ‘_’, the ‘count’ of the mutations was determined and added as a column to the dataframe, df.

In [21]:
def mutation_counter(col):
    ### This function counts the number of mutations in each entry of the dataframe column 'mutations' ###
    ### and returns the length of each type of mutation in 4 different lists ('one','two','three','four','more') ###
    ### and a fifth list, 'count' which contains the designation for each entry as a 1,2,3,4 or >4. ###
    
    count = []                                                       # list for designating the type of mutation
    
    for i in col:                                                       # Iterating through the different entries in the column
        c = i.count('_')                                                # c is the number of '_' in the entry
        if c == 0:                                                      # if c is 0, its a one point mutation
            count.append(1)                                           # designated as a 1 in the count list
        else:                                                    # if c is 1, its a two point mutation
            count.append(c+1)                                           # designated as a 2 in the count list
        
            
    return(count)                            # return the length of each list

In [22]:
# Count the number of each mutation in the dataframe df
count = mutation_counter(df['mutations'])

In [23]:
print(count)

[1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 5, 6, 2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 5, 3, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [24]:
# Using the function mutation_counter on the 'mutations' column, generated the list 'count' and added it as another 
# column 'count' to the dataframe

df['count'] = count

In [25]:
df.head()

Unnamed: 0,2D1,2D2,2D3,2D4,2D5,2D6,2D7,2D8,2D9,2D10,...,3D5402,3D5403,3D5404,3D5405,3D5406,3D5407,3D5408,Type,mutations,count
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.021,0.02,0.016,-0.011,0.003,0.01,-0.007,inactive,%a119e,1
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,inactive,%a119e_l125p,2
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,inactive,%a161t,1
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.05,0.038,0.051,-0.015,0.017,0.027,-0.049,inactive,%c135y,1
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,inactive,%c135y_e285m,2


In [26]:
max_count = df['count'].max()
print(max_count)

6


The dataset has 1-6 point mutations. 

In human mammary carcinoma, the mutation sites of p53 tend to occur within one single exon (coding region) or a short distance from another, implying that the distance of mutations may be of importance for affecting the function of p53. Reference: Tao Huang, Shen Niu, Zhongping Xu, Yun Huang, Xiangyin Kong, Yu-Dong Cai and Kuo-Chen (2011) ChouPredicting transcriptional activity of multiple site p53 mutants based on hybrid properties. PLoS One 6(8): e22940. 

Here we used 1,2,3,4 and 5 distance features for 1 pt., 2 pt., 3pt., 4pt., 5pt. and 6pt. mutations. The distance features represent the distance between adjacent mutations, ie., the distance between the first and second mutations (difference in position numbers), second and third, and so on. 

For e.g., in a 3pt. mutation, there will be 2 distance features – one, the distance between the first and second mutation positions and the second, the distance between the second and third mutation positions. Five such distance features named ‘distance1’,…,’distance5’ were added to the data set.

In [27]:
# Adding the distance feature of the mutations - the position of the 2 point where the mutation occurred is taken and 
# the absolute value of their difference is the distance
# For the one point mutation, the dist1 is 0
# for the 2 pt mutation, the dist1 is the distance
# for the 3 pt mutation, dist1 will have 1st diff and dist2 will have the 2nd diff
# for the 4 pt mutation, dist1, dist2 and dist3
# for the 5 pt mutation, dist1, dist2 and dist3, dist4
# for the 6 pt mutation, dist1, dist2 and dist3, dist4, dist5

dist1 = []                                                            # empty lists 'dist1' to 'dist5'
dist2 = []
dist3 = []
dist4 = []
dist5 = []

for i in df['mutations']:                                            # iterating through each entry in 'mutations' column
    c = i.count('_')                                                 # c is the count of '_'
    if c == 0:                                                       # if its 0, then 1 pt mutation
        dist1.append(0)                                              # and all dist lists are appended as 0 for that entry
        dist2.append(0)
        dist3.append(0)
        dist4.append(0)
        dist5.append(0)
    elif c == 1:                                                     # If c is 1, then it is a 2 pt mutation
        a, b = i.strip('%').split('_')                               # strip off '%' and split at '_' and get 2 strings - a & b
        match_a = re.match(r"([a-z]+)([0-9]+)([a-z]+)", a, re.I)     # re for a and b - alphabet/number/alphabet
        match_b = re.match(r"([a-z]+)([0-9]+)([a-z]+)", b, re.I)    
        if match_a:
            items_a = match_a.groups()                               # group the 3 diff components into resp items_a, items_b
            pos1 = int(items_a[1])                                   # the 2nd component of each items is the 
        if match_b:                                                  # position of the aminoacid
            items_b = match_b.groups()                              
            pos2 = int(items_b[1])                                   
        dist1.append(abs(pos1 - pos2))                               # absolute difference of the positions is the distance in dist1
        dist2.append(0)                                              # the rest are appended with '0'
        dist3.append(0)
        dist4.append(0)
        dist5.append(0)
    elif c == 2:                                                     # If c is 2, then it is a 3 pt mutation
        a, b, c = i.strip('%').split('_')                            # strip off '%' and split at '_' and get 3 strings - a,b,c
        match_a = re.match(r"([a-z]+)([0-9]+)([a-z]+)", a, re.I)     # re for a,b,c - alphabet/number/alphabet
        match_b = re.match(r"([a-z]+)([0-9]+)([a-z]+)", b, re.I)     
        match_c = re.match(r"([a-z]+)([0-9]+)([a-z]+)", c, re.I)
        if match_a:
            items_a = match_a.groups()                               # group the 3 diff components into items_a, items_b, items_c
            pos1 = int(items_a[1])                                   # the 2nd component of all items is the position of the aminoacid
        if match_b:    
            items_b = match_b.groups()                               
            pos2 = int(items_b[1])                                   
        if match_c:    
            items_c = match_c.groups()                               
            pos3 = int(items_c[1])
        dist1.append(abs(pos1 - pos2))                               # dist 1 has 1st diff and dist2 has 2nd diff
        dist2.append(abs(pos2 - pos3))
        dist3.append(0)                                              # the rest are appended with '0'
        dist4.append(0)
        dist5.append(0)
    elif c == 3:                                                     # If c is 3, then it is a 4 point mutation
        a, b, c, d = i.strip('%').split('_')                         # strip off '%', split at '_' , get 4 strings - a,b,c,d
        match_a = re.match(r"([a-z]+)([0-9]+)([a-z]+)", a, re.I)     # re for a,b,c,d - alphabet/number/alphabet
        match_b = re.match(r"([a-z]+)([0-9]+)([a-z]+)", b, re.I)     
        match_c = re.match(r"([a-z]+)([0-9]+)([a-z]+)", c, re.I)
        match_d = re.match(r"([a-z]+)([0-9]+)([a-z]+)", d, re.I)     
        if match_a:
            items_a = match_a.groups()                               # group the 3 diff components into items_a,items_b,items_c,items_d
            pos1 = int(items_a[1])                                   
        if match_b:    
            items_b = match_b.groups()                               
            pos2 = int(items_b[1])                                   
        if match_c:    
            items_c = match_c.groups()                               
            pos3 = int(items_c[1])
        if match_d:    
            items_d = match_d.groups()                               
            pos4 = int(items_d[1])
        dist1.append(abs(pos1 - pos2))                               # dist1 is 1st diff, dist2 is 2nd diff and dist3 is 3rd diff 
        dist2.append(abs(pos2 - pos3))
        dist3.append(abs(pos3 - pos4))
        dist4.append(0)                                              # the rest are appended with '0'
        dist5.append(0)
    elif c == 4:                                                     # If c is 4, then it is a 5 pt mutation
        a, b, c, d, e = i.strip('%').split('_')                      # strip off '%', split at '_', get 5 strings - a,b,c,d,e
        match_a = re.match(r"([a-z]+)([0-9]+)([a-z]+)", a, re.I)     # re for a,b,c,d,e - alphabet/number/alphabet
        match_b = re.match(r"([a-z]+)([0-9]+)([a-z]+)", b, re.I)     
        match_c = re.match(r"([a-z]+)([0-9]+)([a-z]+)", c, re.I)
        match_d = re.match(r"([a-z]+)([0-9]+)([a-z]+)", d, re.I)     
        match_e = re.match(r"([a-z]+)([0-9]+)([a-z]+)", e, re.I)
        if match_a:
            items_a = match_a.groups()                               # group the 3 diff components into items_a,items_b,items_c,items_d
            pos1 = int(items_a[1])                                   # and items_e
        if match_b:                                                  # the 2nd component of items_b is the position of the aminoacid
            items_b = match_b.groups()                               
            pos2 = int(items_b[1])                                   
        if match_c:    
            items_c = match_c.groups()                               
            pos3 = int(items_c[1])
        if match_d:    
            items_d = match_d.groups()                               
            pos4 = int(items_d[1])
        if match_e:    
            items_e = match_e.groups()                               
            pos5 = int(items_e[1])
        dist1.append(abs(pos1 - pos2))                               # dist1-1st, dist2-2nd, dist3-3rd, dist4-4th
        dist2.append(abs(pos2 - pos3))
        dist3.append(abs(pos3 - pos4))
        dist4.append(abs(pos4 - pos5))
        dist5.append(0)                                              # the rest are appended with '0'
    elif c == 5:                                                     # If c is 5, then it is a 6 pt mutation
        a, b, c, d, e, f = i.strip('%').split('_')                   # strip off '%', split at '_', get 6 strings - a,b,c,d,e,f
        match_a = re.match(r"([a-z]+)([0-9]+)([a-z]+)", a, re.I)     # re for a,b,c,d,e,f - alphabet/number/alphabet
        match_b = re.match(r"([a-z]+)([0-9]+)([a-z]+)", b, re.I)     
        match_c = re.match(r"([a-z]+)([0-9]+)([a-z]+)", c, re.I)
        match_d = re.match(r"([a-z]+)([0-9]+)([a-z]+)", d, re.I)     
        match_e = re.match(r"([a-z]+)([0-9]+)([a-z]+)", e, re.I)
        match_f = re.match(r"([a-z]+)([0-9]+)([a-z]+)", f, re.I)
        if match_a:
            items_a = match_a.groups()                               # group the 3 diff components into items_a, items_b, items_c, items_d
            pos1 = int(items_a[1])                                   # items_e, items_f
        if match_b:                                                  # the 2nd component of items_b is the position of the aminoacid
            items_b = match_b.groups()                               
            pos2 = int(items_b[1])                                   
        if match_c:    
            items_c = match_c.groups()                               
            pos3 = int(items_c[1])
        if match_d:    
            items_d = match_d.groups()                               
            pos4 = int(items_d[1])
        if match_e:    
            items_e = match_e.groups()                               
            pos5 = int(items_e[1])
        if match_f:    
            items_f = match_f.groups()                               
            pos6 = int(items_f[1])
        dist1.append(abs(pos1 - pos2))                                # dist1-1st, dist2-2nd, dist3-3rd, dist4-4th, dist5-5th 
        dist2.append(abs(pos2 - pos3))
        dist3.append(abs(pos3 - pos4))
        dist4.append(abs(pos4 - pos5))
        dist5.append(abs(pos5 - pos6))                                # the rest are appended with '0'                                                               
                                                                     

In [28]:
# Added the distance lists as columns 
df['distance1'] = dist1
df['distance2'] = dist2
df['distance3'] = dist3
df['distance4'] = dist4
df['distance5'] = dist5

In [29]:
df.head()

Unnamed: 0,2D1,2D2,2D3,2D4,2D5,2D6,2D7,2D8,2D9,2D10,...,3D5407,3D5408,Type,mutations,count,distance1,distance2,distance3,distance4,distance5
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.0,-0.015,...,0.01,-0.007,inactive,%a119e,1,0,0,0,0,0
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.0,-0.002,...,0.013,0.005,inactive,%a119e_l125p,2,6,0,0,0,0
2,-0.169,-0.025,-0.01,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.016,-0.018,inactive,%a161t,1,0,0,0,0,0
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.02,-0.019,...,0.027,-0.049,inactive,%c135y,1,0,0,0,0,0
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,0.009,0.013,inactive,%c135y_e285m,2,150,0,0,0,0


There were 31269 ‘inactive’ entries and only 151 ‘active’ entries in the dataset initially. But, after removing the missing values and NaNs, adding the ‘count’ and distance features, the data frame has 31159 entries with 5416 columns. The number of ‘inactive’ entries is now 31008 and 151 ‘active’ entries.

In [30]:
df.shape

(31159, 5416)

In [31]:
# Number of 'active' and 'inactive' records

sum(df['Type'] == 'active')

151

In [32]:
sum(df['Type'] == 'inactive')

31008

In [33]:
# converting everything in the dataframe to numeric
df = df.apply(pd.to_numeric, errors='ignore')

In [34]:
df_cols = df.columns