In [241]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error,r2_score

In [242]:
Drug = pd.read_csv(r"dataset .csv")

In [243]:
Drug.shape

(3007, 23)

In [244]:
Drug.describe()

Unnamed: 0,SNO,NLME_RESULT_ID,NLME_CURVE_ID,COMPANY_ID,COSMIC_ID,DRUG_ID,MIN_CONC,MAX_CONC,LN_IC50
count,3002.0,3001.0,3001.0,3002.0,3002.0,3001.0,3001.0,3000.0,3002.0
mean,1501.498668,342.0,15727830.0,1045.0,823673.569287,46.938687,0.017279,4.418907,1.547655
std,866.744774,0.0,88604.92,0.0,93094.62585,72.225694,0.019293,4.935705,2.407004
min,1.0,342.0,15580430.0,1045.0,683667.0,1.0,0.000391,0.1,-8.727691
25%,751.25,342.0,15652830.0,1045.0,753531.0,6.0,0.007813,2.0,0.311601
50%,1501.5,342.0,15723440.0,1045.0,905955.0,29.0,0.007813,2.0,2.106095
75%,2251.75,342.0,15794280.0,1045.0,907175.0,35.0,0.02,5.12,3.142392
max,3000.0,342.0,15901390.0,1045.0,909702.0,249.0,0.0625,16.0,6.586846


In [245]:
Drug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SNO              3002 non-null   float64
 1   DATASET          3002 non-null   object 
 2   NLME_RESULT_ID   3001 non-null   float64
 3   NLME_CURVE_ID    3001 non-null   float64
 4   COMPANY_ID       3002 non-null   float64
 5   SANGER_MODEL_ID  3001 non-null   object 
 6   COSMIC_ID        3002 non-null   float64
 7   CELL_LINE_NAME   3002 non-null   object 
 8   TCGA_DESC        3001 non-null   object 
 9   TISSUE           2985 non-null   object 
 10  TISSUE_SUBTYPE   2993 non-null   object 
 11  DRUG_ID          3001 non-null   float64
 12  DRUG_NAME        3002 non-null   object 
 13  PUTATIVE_TARGET  2999 non-null   object 
 14  PATHWAY_NAME     2999 non-null   object 
 15  MIN_CONC         3001 non-null   float64
 16  MAX_CONC         3000 non-null   float64
 17  LN_IC50       

In [246]:
Drug.isnull().sum()

SNO                 5
DATASET             5
NLME_RESULT_ID      6
NLME_CURVE_ID       6
COMPANY_ID          5
SANGER_MODEL_ID     6
COSMIC_ID           5
CELL_LINE_NAME      5
TCGA_DESC           6
TISSUE             22
TISSUE_SUBTYPE     14
DRUG_ID             6
DRUG_NAME           5
PUTATIVE_TARGET     8
PATHWAY_NAME        8
MIN_CONC            6
MAX_CONC            7
LN_IC50             5
GENE                6
TRANSCRIPT          6
cDNA                5
AA_POSITION         5
GENE_LIST           5
dtype: int64

In [247]:
for i in Drug.select_dtypes(include="object"):
    Drug[i].fillna(Drug[i].mode()[0],inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Drug[i].fillna(Drug[i].mode()[0],inplace= True)


In [248]:
Drug.select_dtypes(include="float64").columns

Index(['SNO', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COMPANY_ID', 'COSMIC_ID',
       'DRUG_ID', 'MIN_CONC', 'MAX_CONC', 'LN_IC50'],
      dtype='object')

In [249]:
from sklearn.impute import SimpleImputer


In [250]:
si = SimpleImputer(strategy="mean")
ar = si.fit_transform(Drug[['SNO', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COMPANY_ID', 'COSMIC_ID','DRUG_ID', 'MIN_CONC', 'MAX_CONC', 'LN_IC50']])

In [251]:
new_data = pd.DataFrame(ar,columns=Drug.select_dtypes(include="float64").columns)

In [252]:
Drug[['SNO', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COMPANY_ID', 'COSMIC_ID',
       'DRUG_ID', 'MIN_CONC', 'MAX_CONC', 'LN_IC50']] = new_data

In [253]:
Drug.describe()

Unnamed: 0,SNO,NLME_RESULT_ID,NLME_CURVE_ID,COMPANY_ID,COSMIC_ID,DRUG_ID,MIN_CONC,MAX_CONC,LN_IC50
count,3007.0,3007.0,3007.0,3007.0,3007.0,3007.0,3007.0,3007.0,3007.0
mean,1501.498668,342.0,15727830.0,1045.0,823673.569287,46.938687,0.017279,4.418907,1.547655
std,866.023629,0.0,88516.45,0.0,93017.169621,72.153577,0.019274,4.929955,2.405002
min,1.0,342.0,15580430.0,1045.0,683667.0,1.0,0.000391,0.1,-8.727691
25%,752.5,342.0,15653020.0,1045.0,753531.0,6.0,0.007813,2.0,0.330727
50%,1501.498668,342.0,15723640.0,1045.0,905955.0,29.0,0.007813,2.0,2.103799
75%,2250.5,342.0,15794230.0,1045.0,907175.0,35.0,0.02,5.12,3.141288
max,3000.0,342.0,15901390.0,1045.0,909702.0,249.0,0.0625,16.0,6.586846
