### Scikit-learn (sklearn)

In [32]:
import numpy, matplotlib, scipy, pandas as pd
import sklearn

In [8]:
# Display the installed versions of the core libraries as a single tuple.
numpy.__version__, scipy.__version__, matplotlib.__version__, sklearn.__version__

('1.23.5', '1.10.0', '3.7.0', '1.2.1')

### Scikit-learn built-in datasets

#### i) Toy Datasets

In [9]:
from sklearn.datasets import load_diabetes

In [26]:
diabetes = load_diabetes(as_frame=True, scaled=False) # as_frame=True returns pandas objects; scaled=False keeps the features unscaled
type(diabetes) # a Bunch object behaves like a dictionary with keys and values

sklearn.utils._bunch.Bunch

In [27]:
# List the entries stored in the Bunch.
diabetes.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [28]:
diabetes.get("data") # dict-style lookup of the "data" entry; a DataFrame here because the Bunch was loaded with as_frame=True
diabetes["data"] # key indexing reaches the same entry
diabetes.data # attribute access reaches the same entry; as the last expression, this is what the cell displays

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,59.0,2.0,32.1,101.00,157.0,93.2,38.0,4.00,4.8598,87.0
1,48.0,1.0,21.6,87.00,183.0,103.2,70.0,3.00,3.8918,69.0
2,72.0,2.0,30.5,93.00,156.0,93.6,41.0,4.00,4.6728,85.0
3,24.0,1.0,25.3,84.00,198.0,131.4,40.0,5.00,4.8903,89.0
4,50.0,1.0,23.0,101.00,192.0,125.4,52.0,4.00,4.2905,80.0
...,...,...,...,...,...,...,...,...,...,...
437,60.0,2.0,28.2,112.00,185.0,113.8,42.0,4.00,4.9836,93.0
438,47.0,2.0,24.9,75.00,225.0,166.0,42.0,5.00,4.4427,102.0
439,60.0,2.0,24.9,99.67,162.0,106.6,43.0,3.77,4.1271,95.0
440,36.0,1.0,30.0,95.00,201.0,125.2,42.0,4.79,5.1299,85.0


In [20]:
diabetes.target # the target values, returned as a pandas Series

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [21]:
# Names of the feature columns, as a plain list of strings.
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [22]:
# File name recorded in the Bunch for the feature data.
diabetes.data_filename

'diabetes_data_raw.csv.gz'

In [23]:
# File name recorded in the Bunch for the target data.
diabetes.target_filename

'diabetes_target.csv.gz'

In [25]:
# DESCR holds the dataset's text description; print() is used so its line breaks are rendered.
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [30]:
# NOTE(review): this rebinds `diabetes`, replacing the unscaled Bunch loaded earlier in the notebook.
diabetes = load_diabetes(as_frame=True) # scaled=False is omitted this time, so the features come back scaled
diabetes.data

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


#### ii) Real World Datasets

In [31]:
from sklearn.datasets import  fetch_california_housing
# as_frame=True requests pandas objects; download_if_missing=True allows the data
# to be downloaded when it is not already available locally.
california = fetch_california_housing(as_frame=True, download_if_missing=True)
type(california)

sklearn.utils._bunch.Bunch

#### Making a DataFrame from a scikit-learn Bunch object

In [34]:
# Combine the California housing features and target into a single frame.
# california.data is already a DataFrame whose columns are the feature names
# (the Bunch was fetched with as_frame=True), so it does not need to be rebuilt
# with pd.DataFrame. assign() returns a new frame with the target appended as
# "Price" and leaves the Bunch's own data untouched, so this cell can be
# re-run safely.
df = california.data.assign(Price=california.target)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [35]:
df.info() # every column is float64 with no missing values, so the dataset is ready to use as-is

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Price       20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


#### iii) Download from Public ML Repositories 

In [36]:
from sklearn import datasets

In [37]:
# Fetch version 1 of the Titanic dataset from OpenML (requires network access
# on first use). The parser is pinned to "liac-arff", the default in
# scikit-learn 1.2: that release warns that the default will later change to
# "auto", and a different parser can infer different column dtypes. Naming it
# explicitly silences the warning and keeps the result stable across
# scikit-learn versions (the `parser` argument needs scikit-learn >= 1.2).
titanic = datasets.fetch_openml(name='titanic', version=1, parser='liac-arff')
type(titanic)

  warn(


sklearn.utils._bunch.Bunch

In [38]:
# List the entries stored in the OpenML Bunch.
titanic.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [39]:
# URL of the dataset's page on OpenML.
titanic.url

'https://www.openml.org/d/40945'

In [40]:
# Metadata dictionary for the dataset (id, version, format, default target attribute, ...).
titanic.details

{'id': '40945',
 'name': 'Titanic',
 'version': '1',
 'description_version': '7',
 'format': 'ARFF',
 'upload_date': '2017-10-16T01:17:36',
 'licence': 'Public',
 'url': 'https://api.openml.org/data/v1/download/16826755/Titanic.arff',
 'parquet_url': 'http://openml1.win.tue.nl/dataset40945/dataset_40945.pq',
 'file_id': '16826755',
 'default_target_attribute': 'survived',
 'tag': 'text_data',
 'visibility': 'public',
 'minio_url': 'http://openml1.win.tue.nl/dataset40945/dataset_40945.pq',
 'status': 'active',
 'processing_date': '2018-10-04 07:19:36',
 'md5_checksum': '60ac7205eee0ba5045c90b3bba95b1c4'}

In [41]:
# Combine the Titanic features and target into a single frame.
# titanic.data is already a DataFrame with the feature names as its columns,
# so it does not need to be rebuilt with pd.DataFrame. assign() returns a new
# frame with the target appended as "Target" and leaves the Bunch's own data
# untouched, so this cell can be re-run safely.
# NOTE(review): this rebinds `df`, which previously held the California housing frame.
df = titanic.data.assign(Target=titanic.target)
df

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,Target
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO",1
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",1
2,1.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,,0
1305,3.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,,0
1306,3.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,,0
1307,3.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,,0


In [42]:
# Summarise dtypes and non-null counts; unlike the California housing frame, this one
# has missing values (e.g. age, cabin) and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   float64 
 5   parch      1309 non-null   float64 
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    object  
 12  home.dest  745 non-null    object  
 13  Target     1309 non-null   category
dtypes: category(3), float64(5), object(6)
memory usage: 116.8+ KB


### iv) Synthetic Datasets
#### a) Regression

In [44]:
# Generate a tiny synthetic regression problem: 8 samples, 3 features and a
# single target with noise added. random_state fixes the seed so the arrays
# are the same on every run.
X, y = datasets.make_regression(
    n_samples=8,
    n_features=3,
    n_targets=1,
    noise=2,
    random_state=54,
)
X

array([[ 0.58302756, -1.67541955, -1.47097856],
       [ 0.73338755,  0.62614846, -1.35413559],
       [ 1.15501298, -0.0943752 , -0.23167791],
       [ 0.07856383, -1.08099899, -1.85221074],
       [-1.5635627 ,  2.50846758, -0.78087024],
       [-0.90346546, -0.81955082, -1.15683452],
       [-0.36119376, -0.37405407, -0.52918568],
       [ 0.18552945,  0.67755519, -0.67746106]])

In [45]:
# Regression targets generated together with X (one value per sample).
y

array([-166.43549425,   70.30519059,   11.63340556, -114.21469516,
        211.19236177, -108.21659958,  -47.683669  ,   66.81762377])

#### b) Classification

In [46]:
# Generate a tiny synthetic binary classification problem: 10 samples with
# 5 features each. random_state fixes the seed so the arrays are the same on
# every run.
X1, y1 = datasets.make_classification(
    n_samples=10,
    n_features=5,
    n_classes=2,
    random_state=54,
)
X1

array([[-1.3743581 , -0.52665347,  0.02273785,  0.64178611, -0.93058765],
       [ 1.26503243,  1.4114587 , -0.64975823, -0.38450394,  0.33216614],
       [-0.96278708, -1.97089887,  1.10296862,  0.37093242,  0.25459983],
       [-1.16327786,  1.330538  , -1.18610031,  1.15239809, -1.79283189],
       [-0.65753607,  0.10623327, -0.23218549,  1.79220363, -0.6479195 ],
       [ 1.28721028, -1.10688289,  1.06451032,  0.3282265 ,  1.77706034],
       [-0.88485538,  0.68116475, -0.67766458, -0.42146135, -1.17647145],
       [ 1.74727704, -1.71655512,  1.59023347,  0.81676153,  2.53333593],
       [-0.83570468,  0.65958238, -0.6510521 ,  0.12023732, -1.12032022],
       [ 0.04402878, -1.62476802,  1.11323753,  0.60976223,  0.95877628]])

In [47]:
# Class labels (0 or 1) generated together with X1, one per sample.
y1

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0])