In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Ten standard-normal samples as a named Series.
# A seeded generator is used so the displayed values are identical on every
# Restart & Run All instead of changing with each kernel run.
data = pd.Series(np.random.default_rng(42).standard_normal(10), name='X1')
data

In [None]:
# Index of the Series; none was passed at construction, so this is the default integer index
data.index

In [None]:
# The Series' data as a NumPy array, without the index
data.values

In [None]:
# Build a Series from a mapping: the dict keys become the index labels,
# the dict values become the data.
dic = dict(Karachi=10, Islamabad=5, Lahore=1, Sukkur=3)
data2 = pd.Series(data=dic, name='X')

In [None]:
# Display the dict-backed Series; the city names appear as index labels
data2

In [None]:
# Second element by position.
# Plain `data2[1]` depends on pandas treating an integer key as a position when
# the index holds strings; that fallback is deprecated, so the positional
# accessor is used explicitly.
data2.iloc[1]

In [None]:
# Positional slice: elements at positions 1 and 2 (the stop position is excluded)
data2.iloc[1:3]

In [None]:
# Integer labels that deliberately do not start at 0, so positions and labels differ.
data = pd.Series(data=['a', 'b', 'c'], index=[2, 3, 4])
# With plain [], a slice is positional (implicit index) while a single key
# is looked up by label (explicit index).
data[0:2]

In [None]:
# .loc always uses the explicit index labels, and a label slice includes both endpoints.
# With labels [2, 3, 4], only label 2 lies within 0..2.
data.loc[0:2]

In [None]:
# .iloc always uses implicit integer positions; the slice is half-open,
# so this selects positions 0 and 1.
data.iloc[0:2]

## Creating a DataFrame

In [None]:
# 20 rows x 2 columns of standard-normal samples.
# A seeded generator keeps the frame (and everything derived from it below)
# identical across kernel restarts.
data = pd.DataFrame(
    np.random.default_rng(0).standard_normal((20, 2)),
    columns=['X1', 'X2'],
)
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

In [None]:
# Column names, non-null counts, dtypes and memory usage
data.info()

In [None]:
# Two-level feature derived from X1: 500 where X1 > 0.5, otherwise 100.
# Each boolean mask is scaled by its level and the two results are summed.
data['X3'] = data['X1'].gt(0.5).mul(500) + data['X1'].le(0.5).mul(100)
data.head()

In [None]:
# Element-wise difference of the two random columns
data['X4'] = data['X1'].sub(data['X2'])
data.head()

In [None]:
# Create a new attribute (column) holding a random label from {-1, 0, 1}.
# The draw is 1-D and sized from the frame itself, so it matches the shape of a
# single column and does not depend on a hard-coded row count; the generator
# is seeded so the labels are reproducible.
data['Y'] = np.random.default_rng(1).integers(-1, 2, size=len(data))
data.head()

In [None]:
# Indexing with a boolean DataFrame keeps the frame's shape and shows NaN
# wherever the mask is False (i.e. where a value is missing)
data[data.notnull()]

## Check for null values

In [None]:
# Column position 2 (X3) holds integers. Cast it to float first so that writing
# NaN below does not depend on pandas silently upcasting the column, which is
# deprecated behaviour.
data[data.columns[2]] = data.iloc[:, 2].astype(float)
data.iloc[0, 2] = np.nan  # inject one missing value so there is something to count
# Per-column count of missing entries. The DataFrame method is used instead of
# np.sum(...), which forwards axis=None to pandas and triggers a deprecation.
data.isnull().sum()

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged)
data.loc[data.duplicated()]

In [None]:
# Small frame with missing values for the dropna / fillna demos.
# The last row is one element longer than the others, so pandas pads the
# shorter rows with NaN in the extra column.
df = pd.DataFrame(
    data=[
        [1, 2, np.nan, 4],
        [3, 6, 7, np.nan],
        [5, 4, 3, 2, 1],
    ]
)
df.info()

In [None]:
# Drop every column that contains at least one missing value (how defaults to 'any')
df.dropna(axis=1)

In [None]:
# Drop only the columns in which every value is missing
df.dropna(axis=1,how='all')

In [None]:
# Explicit form of the default: drop columns with any missing value
df.dropna(axis=1,how='any')

In [None]:
# Drop every row that contains at least one missing value
df.dropna(axis=0)

In [None]:
# Return a new frame with every missing value replaced by 0 (df itself is not modified)
df.fillna(0)

In [None]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. `fillna(method='ffill')` is deprecated in pandas; `ffill()` is
# the supported spelling and returns the same result.
df.ffill()

In [None]:
# Backward fill: each missing value takes the next valid value below it in the
# same column. `fillna(method='bfill')` is deprecated in pandas; `bfill()` is
# the supported spelling and returns the same result.
df.bfill()

In [None]:
# Load scikit-learn's bundled iris dataset and show the shape of its feature matrix
from sklearn import datasets
iris=datasets.load_iris()
iris.data.shape

In [None]:
# Names of the feature columns; used below as the DataFrame column labels
iris.feature_names

In [None]:
# Preview the first rows of the feature matrix rather than echoing the whole
# array, which floods the cell output without adding information.
iris.data[:5]

In [None]:
# Show the target array together with the target names as a tuple
iris.target, iris.target_names

In [None]:
# Wrap the iris feature matrix in a DataFrame, labelling columns with the feature names.
# Note: this rebinds `df`, replacing the small NaN demo frame.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

In [None]:
# Append the iris target array as a new 'Target' column next to the features
df['Target']=iris.target
df.head()

In [None]:
# Summary statistics for every numeric column, including Target
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of rows for each distinct Target value
df['Target'].value_counts()

In [None]:
# Column labels; these exact strings are the keys used by the plots below
df.columns

In [None]:
# Scatter one pair of measurements per figure, colouring each point by its Target value.
# A single loop replaces two copy-pasted blocks so both figures are built the
# same way, and explicit Figure/Axes objects are used instead of the implicit
# pyplot "current figure" state.
for x_col, y_col in [
    ('sepal length (cm)', 'sepal width (cm)'),
    ('petal length (cm)', 'petal width (cm)'),
]:
    fig, ax = plt.subplots()
    points = ax.scatter(df[x_col], df[y_col], c=df['Target'])
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title('Color represents the true label/target')
    fig.colorbar(points, ax=ax)
plt.show()  # render the figures instead of echoing the last Colorbar object's repr

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
# (Target is included because it is a column of df at this point)
import seaborn as sns
sns.heatmap(df.corr(),annot=True)

In [None]:
# Joint plot of sepal width (x) against sepal length (y), coloured by Target via `hue`
sns.jointplot(x='sepal width (cm)', y='sepal length (cm)', hue='Target', data=df)

## Data normalization
Normalize the input attributes (x) such that the new values (x') are scaled and fall in the range [0,1]
$$x'=\frac{x-x_{min}}{x_{max}-x_{min}}$$

In [None]:
# Min-max scaling of the feature columns only (Target is left out).
# Each column is shifted by its own minimum and divided by its own range.
data_n = df.drop(columns='Target').copy()
data_n = data_n.sub(data_n.min()).div(data_n.max() - data_n.min())
data_n.head()

In [None]:
# Sanity check: after min-max scaling every column should report min 0 and max 1
data_n.describe()

## Standardization
Standardize the input attributes (x) such that the new values (x') have zero mean and unit standard deviation.
$$x'=\frac{x-\mu}{\sigma}$$

In [None]:
# Z-score scaling of the feature columns only (Target is left out).
# Each column is centred on its own mean and divided by its own standard
# deviation (pandas' default sample standard deviation, ddof=1).
data_s = df.drop(columns='Target').copy()
data_s = data_s.sub(data_s.mean()).div(data_s.std())
data_s.head()

In [None]:
# Sanity check: after standardization every column should report mean ~0 and std 1
data_s.describe()

In [None]:
# Compare the distribution of one feature before and after each rescaling.
# One loop replaces two copy-pasted blocks. Bars are semi-transparent so the
# first histogram stays visible where the two overlap, and each figure gets
# axis labels and a title so it can be read on its own.
col = 'petal length (cm)'
for scaled, scaled_label in [(data_s, 'standardized'), (data_n, 'normalized')]:
    fig, ax = plt.subplots()
    ax.hist(df[col], edgecolor='red', alpha=0.6, label='original')
    ax.hist(scaled[col], edgecolor='red', alpha=0.6, label=scaled_label)
    ax.set_xlabel(col)
    ax.set_ylabel('count')
    ax.set_title(f'{col}: original vs {scaled_label}')
    ax.legend(loc=0)
plt.show()  # render the figures instead of echoing the last Legend object's repr
