In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error

from joblib import dump,load
import plotly.express as px

In [4]:
df =pd.read_csv("../sample.csv")

In [5]:
df.head()

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,1.0,shabeeh,23.0,"$35,000",manager,9867164000.0
1,2.0,kashish,,"$55,000",manager,
2,3.0,nisha shukla,28.0,"$12,457",region manager,9857612000.0
3,4.0,deepak,24.0,"$54,887",assistant manager,
4,5.0,mayank,25.0,"$55,248",managaer,884751300.0


In [6]:
df.isnull()#to check the value of if true or false
df.notnull()

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,True,True,True,True,True,True
1,True,True,False,True,True,False
2,True,True,True,True,True,True
3,True,True,True,True,True,False
4,True,True,True,True,True,True
5,True,True,False,True,False,True
6,False,False,False,False,False,False


In [7]:
df.isnull().sum()#for check the no of null

Sno            1
name           1
age            3
salary         1
degisnation    2
Mobile         3
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sno          6 non-null      float64
 1   name         6 non-null      object 
 2   age          4 non-null      float64
 3   salary       6 non-null      object 
 4   degisnation  5 non-null      object 
 5   Mobile       4 non-null      float64
dtypes: float64(3), object(3)
memory usage: 464.0+ bytes


In [9]:
#DATA CLEANING

In [10]:
df

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,1.0,shabeeh,23.0,"$35,000",manager,9867164000.0
1,2.0,kashish,,"$55,000",manager,
2,3.0,nisha shukla,28.0,"$12,457",region manager,9857612000.0
3,4.0,deepak,24.0,"$54,887",assistant manager,
4,5.0,mayank,25.0,"$55,248",managaer,884751300.0
5,6.0,aditya sagar,,"$75,474",,81166150000.0
6,,,,,,


In [11]:
def salary_clean(salary):
    if isinstance(salary,str):
        salary=salary.strip()
        if "$" in salary :
            salary = salary.replace('$','')
        if "," in salary:
            salary = salary.replace(',','')
        return int(salary)
    return salary


In [12]:
df ['salary'].apply(salary_clean)

0    35000.0
1    55000.0
2    12457.0
3    54887.0
4    55248.0
5    75474.0
6        NaN
Name: salary, dtype: float64

In [13]:
df['salary']=df['salary'].apply(salary_clean)
df

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,1.0,shabeeh,23.0,35000.0,manager,9867164000.0
1,2.0,kashish,,55000.0,manager,
2,3.0,nisha shukla,28.0,12457.0,region manager,9857612000.0
3,4.0,deepak,24.0,54887.0,assistant manager,
4,5.0,mayank,25.0,55248.0,managaer,884751300.0
5,6.0,aditya sagar,,75474.0,,81166150000.0
6,,,,,,


In [14]:
df

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,1.0,shabeeh,23.0,35000.0,manager,9867164000.0
1,2.0,kashish,,55000.0,manager,
2,3.0,nisha shukla,28.0,12457.0,region manager,9857612000.0
3,4.0,deepak,24.0,54887.0,assistant manager,
4,5.0,mayank,25.0,55248.0,managaer,884751300.0
5,6.0,aditya sagar,,75474.0,,81166150000.0
6,,,,,,


In [15]:
#don not dothis ever
df.dropna()

Unnamed: 0,Sno,name,age,salary,degisnation,Mobile
0,1.0,shabeeh,23.0,35000.0,manager,9867164000.0
2,3.0,nisha shukla,28.0,12457.0,region manager,9857612000.0
4,5.0,mayank,25.0,55248.0,managaer,884751300.0


In [16]:
#test run
df.drop(['Mobile','Sno','name'],axis=1,inplace=True)
df


Unnamed: 0,age,salary,degisnation
0,23.0,35000.0,manager
1,,55000.0,manager
2,28.0,12457.0,region manager
3,24.0,54887.0,assistant manager
4,25.0,55248.0,managaer
5,,75474.0,
6,,,


#we remove null ( missing )values by using scikit-learn libaray 
#applymap() use to check the value in whole data table 

In [17]:
df

Unnamed: 0,age,salary,degisnation
0,23.0,35000.0,manager
1,,55000.0,manager
2,28.0,12457.0,region manager
3,24.0,54887.0,assistant manager
4,25.0,55248.0,managaer
5,,75474.0,
6,,,


In [18]:
from sklearn.impute import SimpleImputer

#scikit-learn
class instantiate(object create)
fit() the data into the object
transform() to get the result


fit_transform() to pass the data and get the result directly

In [19]:
imputer = SimpleImputer()




In [20]:
cols = ['age','salary']
imputer.fit_transform(df[cols])

array([[2.3000e+01, 3.5000e+04],
       [2.5000e+01, 5.5000e+04],
       [2.8000e+01, 1.2457e+04],
       [2.4000e+01, 5.4887e+04],
       [2.5000e+01, 5.5248e+04],
       [2.5000e+01, 7.5474e+04],
       [2.5000e+01, 4.8011e+04]])

In [21]:
cols = ['age','salary']
df[cols] = imputer.fit_transform(df[cols])
df[cols]

Unnamed: 0,age,salary
0,23.0,35000.0
1,25.0,55000.0
2,28.0,12457.0
3,24.0,54887.0
4,25.0,55248.0
5,25.0,75474.0
6,25.0,48011.0


df['degisnation'].value_count()

In [22]:
df['degisnation'].value_counts()

manager              2
region manager       1
assistant manager    1
managaer             1
Name: degisnation, dtype: int64

[[]] use to create oned array to 2d array

In [23]:
imp2 = SimpleImputer(strategy='most_frequent')
imp2.fit_transform(df[['degisnation']])

array([['manager'],
       ['manager'],
       ['region manager'],
       ['assistant manager'],
       ['managaer'],
       ['manager'],
       ['manager']], dtype=object)

In [24]:
df

Unnamed: 0,age,salary,degisnation
0,23.0,35000.0,manager
1,25.0,55000.0,manager
2,28.0,12457.0,region manager
3,24.0,54887.0,assistant manager
4,25.0,55248.0,managaer
5,25.0,75474.0,
6,25.0,48011.0,


In [25]:
df['degisnation'] = imp2.fit_transform(df[['degisnation']])

.ndim() are use to check the data matrix
and shape() is just check the data array

In [26]:
type(df[['degisnation']])

pandas.core.frame.DataFrame

In [27]:
df

Unnamed: 0,age,salary,degisnation
0,23.0,35000.0,manager
1,25.0,55000.0,manager
2,28.0,12457.0,region manager
3,24.0,54887.0,assistant manager
4,25.0,55248.0,managaer
5,25.0,75474.0,manager
6,25.0,48011.0,manager


In [28]:
gender =['male','female']*3+['male']

In [29]:
gender

['male', 'female', 'male', 'female', 'male', 'female', 'male']

In [30]:
df['Gender']=gender

In [31]:
df

Unnamed: 0,age,salary,degisnation,Gender
0,23.0,35000.0,manager,male
1,25.0,55000.0,manager,female
2,28.0,12457.0,region manager,male
3,24.0,54887.0,assistant manager,female
4,25.0,55248.0,managaer,male
5,25.0,75474.0,manager,female
6,25.0,48011.0,manager,male


renaming  the column 

In [32]:
df.rename({'salary':'pay'},axis=1 , inplace=True)

In [33]:
df

Unnamed: 0,age,pay,degisnation,Gender
0,23.0,35000.0,manager,male
1,25.0,55000.0,manager,female
2,28.0,12457.0,region manager,male
3,24.0,54887.0,assistant manager,female
4,25.0,55248.0,managaer,male
5,25.0,75474.0,manager,female
6,25.0,48011.0,manager,male


In [34]:
row =[{
    'age':19,
    'pay':2252125, 
    'degisnation':'manager',
    'gender':'male'}]

In [35]:
df = df.append(row,ignore_index=True)

  df = df.append(row,ignore_index=True)


In [36]:
df

Unnamed: 0,age,pay,degisnation,Gender,gender
0,23.0,35000.0,manager,male,
1,25.0,55000.0,manager,female,
2,28.0,12457.0,region manager,male,
3,24.0,54887.0,assistant manager,female,
4,25.0,55248.0,managaer,male,
5,25.0,75474.0,manager,female,
6,25.0,48011.0,manager,male,
7,19.0,2252125.0,manager,,male


#data should be numerical in scikit-learn

lable_encoder

#onehotencoder(dummy variable)

In [37]:
df['Gender'].value_counts()
#their we use lableencoder

male      4
female    3
Name: Gender, dtype: int64

In [38]:
from sklearn.preprocessing import LabelEncoder

In [39]:
Gender_enc = LabelEncoder()
Gender_enc.fit(df['Gender'])

LabelEncoder()

In [40]:
Gender_enc.transform(df['Gender'])

array([1, 0, 1, 0, 1, 0, 1, 2])

In [41]:
df['Gender']=Gender_enc.transform(df['Gender'])
df

Unnamed: 0,age,pay,degisnation,Gender,gender
0,23.0,35000.0,manager,1,
1,25.0,55000.0,manager,0,
2,28.0,12457.0,region manager,1,
3,24.0,54887.0,assistant manager,0,
4,25.0,55248.0,managaer,1,
5,25.0,75474.0,manager,0,
6,25.0,48011.0,manager,1,
7,19.0,2252125.0,manager,2,male


In [42]:
df.drop(['gender'],axis=1,inplace=True)

In [43]:
df

Unnamed: 0,age,pay,degisnation,Gender
0,23.0,35000.0,manager,1
1,25.0,55000.0,manager,0
2,28.0,12457.0,region manager,1
3,24.0,54887.0,assistant manager,0
4,25.0,55248.0,managaer,1
5,25.0,75474.0,manager,0
6,25.0,48011.0,manager,1
7,19.0,2252125.0,manager,2


In [44]:
Gender_enc.classes_

array(['female', 'male', nan], dtype=object)

In [45]:
Gender_enc.inverse_transform([1,0])

array(['male', 'female'], dtype=object)

# 1- feb-2022

*** multiple valuse in dada so use onehotencoder

In [46]:
df

Unnamed: 0,age,pay,degisnation,Gender
0,23.0,35000.0,manager,1
1,25.0,55000.0,manager,0
2,28.0,12457.0,region manager,1
3,24.0,54887.0,assistant manager,0
4,25.0,55248.0,managaer,1
5,25.0,75474.0,manager,0
6,25.0,48011.0,manager,1
7,19.0,2252125.0,manager,2


In [47]:
df['degisnation'].value_counts()

manager              5
region manager       1
assistant manager    1
managaer             1
Name: degisnation, dtype: int64

In [48]:
df['degisnation'].unique()

array(['manager', 'region manager', 'assistant manager', 'managaer'],
      dtype=object)

OneHotecnoder (dummy variable creation) 


state                       amount
*************************************


newyork                     475
sanfranscios                47578
newyork                     6658
sanfranscios                17626
newyork                     25644
sanfranscios                71189
newyork                     24166
sanfranscios                77457
newyork                     42537   
sanfranscios                81611


In [49]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
desig_enc=OneHotEncoder(drop='first')
desig_enc.fit(df[['degisnation']])

OneHotEncoder(drop='first')

use the "toarray()"

In [51]:
dummy_d=desig_enc.transform(df[['degisnation']]).toarray()


#.numpy matrix  create the toarray()

In [52]:
dummy_d

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [53]:
df.drop(columns=["degisnation"],axis=1,inplace=True)

In [54]:
df.values

array([[2.300000e+01, 3.500000e+04, 1.000000e+00],
       [2.500000e+01, 5.500000e+04, 0.000000e+00],
       [2.800000e+01, 1.245700e+04, 1.000000e+00],
       [2.400000e+01, 5.488700e+04, 0.000000e+00],
       [2.500000e+01, 5.524800e+04, 1.000000e+00],
       [2.500000e+01, 7.547400e+04, 0.000000e+00],
       [2.500000e+01, 4.801100e+04, 1.000000e+00],
       [1.900000e+01, 2.252125e+06, 2.000000e+00]])

In [55]:
import numpy as np

In [56]:
data=np.hstack((dummy_d,df.values))
data.shape

(8, 6)

In [57]:
#option2 for the bettter version
dummy_df=pd.DataFrame(dummy_d)

In [58]:
data_df=pd.concat([dummy_df,df],axis=1)
data_df

Unnamed: 0,0,1,2,age,pay,Gender
0,0.0,1.0,0.0,23.0,35000.0,1
1,0.0,1.0,0.0,25.0,55000.0,0
2,0.0,0.0,1.0,28.0,12457.0,1
3,0.0,0.0,0.0,24.0,54887.0,0
4,1.0,0.0,0.0,25.0,55248.0,1
5,0.0,1.0,0.0,25.0,75474.0,0
6,0.0,1.0,0.0,25.0,48011.0,1
7,0.0,1.0,0.0,19.0,2252125.0,2


#Scaling the data 
******************************
MINMAX SCALING
STANDERD SCALING

In [59]:
a=np.array([66,55,67,584,475,58766,324])

In [60]:
a

array([   66,    55,    67,   584,   475, 58766,   324])

In [61]:
a.max()

58766

In [62]:
a/a.max()

array([1.12309839e-03, 9.35915325e-04, 1.14011503e-03, 9.93771909e-03,
       8.08290508e-03, 1.00000000e+00, 5.51339210e-03])

In [63]:
from sklearn.preprocessing import StandardScaler ,MinMaxScaler

In [64]:
mxscaler = MinMaxScaler()
mxscaler.fit(dummy_df)
scaled_data=mxscaler.transform(dummy_df)
print(scaled_data)

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [65]:
stdscaler= StandardScaler()
stdscaler.fit(dummy_df)
scaled_data1=stdscaler.transform(dummy_df)
print(scaled_data1)


[[-0.37796447  0.77459667 -0.37796447]
 [-0.37796447  0.77459667 -0.37796447]
 [-0.37796447 -1.29099445  2.64575131]
 [-0.37796447 -1.29099445 -0.37796447]
 [ 2.64575131 -1.29099445 -0.37796447]
 [-0.37796447  0.77459667 -0.37796447]
 [-0.37796447  0.77459667 -0.37796447]
 [-0.37796447  0.77459667 -0.37796447]]


#slope calculation
1. supervised ml algo
****features and input variables  "x"
{output response or label }   "Y"
"if predticion contains changing numerical values "{REGRESSION}
' if predction contains similar categorical values'[CLASSIFICATION]***

#***************************supervised regression algorithms*************************
*************            . linear regression 
                            .smiple linear regression
                             . multiple linear regression
                             .ploynomial linear regressio
                        1. descision tree algo
                        2. random forest algo (ensemble)
                        3.SVM  (sipport vector machine) 
                        4.XBboost

                    
*************************