<a href="https://colab.research.google.com/github/Mohit-Jangid/Random-Stuff/blob/main/Machine_Learning_Basics_Part_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<URL-encoded download URL>" entries, one per data source.
# NOTE(review): the embedded URL is signed (X-Goog-Date=20240704T150332Z,
# X-Goog-Expires=259200, presumably seconds), so it has most likely expired
# and needs to be regenerated before this cell can download anything.
DATA_SOURCE_MAPPING = 'titanic-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1818188%2F2965537%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240704%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240704T150332Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1f472a0989e157bd81c5fcf079369d40a78d24b3e3abe9d2e4e05f8748740881f556303817510d9ea2b791607e1f26e13b7819dc1980c8884da464c8acaa12c912ba1256bdcdfc8de582d92342037cdaa2b203389ad96b192f53f3009433d7e782d41a3936982f3c42d62195c1a20ff6c8039c1cd657b1ccf891c38e2e97bc5b13729608a6f87d76e7fb83c8c32cc0352324606611763bf6dc8b15236414092ec0a3dce528f35e77f27063313c91a8a6916c85ba0d59be04dc1ec3efc06ef801986bae6cf417ad085f3d83e47a66c2f80bd111cfabe5a2b3ced51de000aef2c3792d66bd34d296034e1dfe1eeef9024344ce9cee8529bab3328bc17802ac95e3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>. A source that
# cannot be fetched or unpacked is reported and skipped.
# NOTE(review): archive members are extracted without path validation, so
# only point DATA_SOURCE_MAPPING at trusted archives.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a ':' inside the URL part cannot break
    # the unpacking into exactly two values.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; when it is missing (or zero) the
            # progress bar stays empty and only the byte counter advances.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, 50 * dl // total_length) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every buffered byte is on disk: the tar branch
            # re-opens the temporary file by name rather than via this handle.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be called `tarfile`: that would rebind
                # the module and break the next non-zip data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs stop working after their validity window.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Follow this LinkedIn group for Machine Learning content: https://www.linkedin.com/groups/7436898/

# Check Python Library Version

In [None]:
# Print the installed version of each core library used in this notebook.
import pandas as pd
import numpy as np
import seaborn as sns

print(f'numpy:{np.__version__}')
print(f'pandas:{pd.__version__}')
print(f'seaborn:{sns.__version__}')

# Read CSV File

In [None]:
# Read CSV File
import pandas as pd

# Load the Titanic passenger data from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")

# Preview the first rows with white text on a black background.
# border-color has to be a valid CSS colour name; an unknown name such as
# 'darkblack' makes the browser drop the declaration, so 'black' is used.
df.head().style.set_properties(**{
    'background-color': 'Black',
    'color': 'white',
    'border-color': 'black',
})

# Drop Columns from the dataframe

In [None]:
# Build a copy of df without the PassengerId, Name and Ticket columns
# (df itself is left unchanged) and preview it.
df_drop_Col = df.drop(columns=['PassengerId', 'Name', 'Ticket'])
df_drop_Col.head()

# Drop Rows from the dataframe

In [None]:
# Build a copy of df without the rows labelled 0, 1, 3 and 6
# (df itself is left unchanged) and preview it.
df_drop_rows = df.drop(index=[0, 1, 3, 6])
df_drop_rows.head()

# Describe function

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df;
# by default describe() reports only the numeric columns of a mixed frame.
df.describe()

In [None]:
# Summary statistics for the passengers with Survived == 0, transposed so
# each column of df becomes a row, with the listed statistics colour-shaded.
(
    df[df['Survived'] == 0]
    .describe()
    .T
    .style
    .background_gradient(subset=['mean', 'std', '50%', 'count'], cmap='RdPu')
)

In [None]:
# Same summary, but with a custom set of percentiles that also covers the
# far upper tail (99.5% and 99.9%).
df.describe(percentiles=[0.05,0.25,0.35,0.5,0.75,0.85,0.95,0.995,0.999])

# Correlation function

In [None]:
# Pairwise correlation matrix of the selected numeric columns
# (corr() uses the Pearson coefficient unless told otherwise).
df[['Survived', 'Pclass', 'Age', 'SibSp','Parch', 'Fare']].corr()

In [None]:
# Fare/Age correlation matrix computed separately for each Embarked value.
# The result is indexed by (Embarked, variable), so every heatmap row is one
# port/variable pair and every column is one variable.
corr = df.groupby(["Embarked"])[["Fare", "Age"]].corr()

sns.heatmap(corr, annot=True, fmt=".2f")
plt.xlabel("Variable")
plt.ylabel("Embarked / variable")
plt.title("Correlation between Age and Fare by Embarked (category)")
plt.show()

In [None]:
# Correlation of every feature column in X with the Survived column,
# drawn as one bar per feature.
X = df.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df.loc[:, 'Survived']
X.corrwith(y).plot(
    kind='bar',
    title="Correlation with Titanic",
    figsize=(16, 4),
    fontsize=15,
    rot=90,
    grid=True,
)
plt.show()

In [None]:
# Annotated heatmap of the feature-to-feature correlation matrix.
corr = df.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].corr()
plt.figure(dpi=100)
plt.title('Correlation Analysis')
sns.heatmap(
    corr,
    annot=True,
    fmt="0.2f",
    cmap='viridis',
    lw=0,
    linecolor='white',
)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Same correlation heatmap, but with the upper triangle (diagonal included)
# masked out so each pair of features is shown only once.
corr = df.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(dpi=100)
plt.title('Correlation Analysis')
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt="0.2f",
    cmap='viridis',
    lw=0,
    linecolor='white',
)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

# Aggregate function

In [None]:
# Column-wise aggregates for Age, Fare and Pclass: total, maximum, mean,
# standard deviation, skewness and kurtosis (one row per statistic).
df[['Age','Fare','Pclass']].agg(['sum','max','mean','std','skew','kurt'])

In [None]:
# Mean fare and number of rows per Embarked value.
# String aliases are used instead of np.mean / np.size: recent pandas
# versions warn when NumPy callables are passed to agg and recommend the
# equivalent string name to keep the current behaviour.
df.groupby("Embarked").agg({"Fare": "mean", "Sex": "size"})

In [None]:
# Number of rows and mean fare for every (Embarked, Pclass) combination.
# String aliases replace np.size / np.mean because recent pandas versions
# warn when NumPy callables are passed to agg; the column labels
# ("size", "mean") stay the same.
df.groupby(["Embarked", "Pclass"]).agg({"Fare": ["size", "mean"]})

In [None]:
# Median age per passenger class.
# The string alias "median" is used instead of np.median because recent
# pandas versions warn when NumPy callables are passed as aggfunc.
df.pivot_table(index='Pclass', values='Age', aggfunc='median')

In [None]:
# Number of non-missing Fare values for every (Sex, Embarked) combination.
# Restricting the pivot to values='Fare' avoids counting every other column
# only to discard it, and already yields a one-column DataFrame.
x = pd.pivot_table(df, index=['Sex', 'Embarked'], values='Fare', aggfunc='count')
x

In [None]:
# Number of rows and mean fare for every (Embarked, Pclass) combination.
# String aliases replace np.size / np.mean because recent pandas versions
# warn when NumPy callables are passed to agg; the column labels
# ("size", "mean") stay the same.
df.groupby(["Embarked", "Pclass"]).agg({"Fare": ["size", "mean"]})

In [None]:
# Mean fare and number of rows per Embarked value.
# String aliases are used instead of np.mean / np.size: recent pandas
# versions warn when NumPy callables are passed to agg and recommend the
# equivalent string name to keep the current behaviour.
df.groupby("Embarked").agg({"Fare": "mean", "Sex": "size"})

In [None]:
# Count, mean and sum of Fare per Embarked value, sorted by the sum in
# ascending order; tail() keeps the last (largest-sum) rows.
(
    df.groupby('Embarked')['Fare']
    .agg(['count', 'mean', 'sum'])
    .sort_values('sum')
    .tail()
)

# value_counts function

In [None]:
# Count how often each Embarked value occurs (missing values are not
# counted by default) and return the counts as a one-column DataFrame.
df['Embarked'].value_counts().to_frame()

In [None]:
# The same counts as a plain Python list; value_counts() orders them from
# the most to the least frequent value by default.
df['Embarked'].value_counts().tolist()

In [None]:
# Share of the male passengers that falls into each Survived value.
males = df.loc[df['Sex'] == 'male']
males['Survived'].value_counts().div(len(males))

In [None]:
# Percentage of female passengers per Embarked value, computed over the
# rows that have an Embarked value.
df.loc[df['Sex'] == 'female', 'Embarked'].value_counts(normalize=True).mul(100)

# fillna method

In [None]:
# Work on an independent deep copy so the fillna experiments below never
# modify df. `deep` is a boolean flag; passing the string 'deep' only
# worked because any non-empty string is truthy.
titanic = df.copy(deep=True)

In [None]:
# Forward-fill: replace each missing value with the last valid value above
# it in the same column. ffill() is the supported spelling; the `method`
# argument of fillna() is deprecated in recent pandas versions.
titanic.ffill(inplace=True)

In [None]:
# Replace missing Age values with the mean of the non-missing ages
# (mean() skips missing values by default) and store the result back.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())

In [None]:
# Replace missing Embarked values with the port code 'Q'.
# The fill value must be the scalar itself: a comparison such as
# `titanic['Embarked'] == 'Q'` is a boolean Series, and filling with it
# would write False into the missing cells instead of a port code.
titanic['Embarked'] = titanic['Embarked'].fillna('Q')

In [None]:
# Replace missing Embarked values with 'C'.
# The result is assigned back to the column: an in-place fillna on the
# selection `titanic['Embarked']` may act on a temporary object and is not
# guaranteed to update titanic (recent pandas versions warn about it).
titanic['Embarked'] = titanic['Embarked'].fillna("C")

In [None]:
# Dict form of fillna: each key is a column name and each value is the fill
# value for that column, so only Embarked is filled (with 'Z').
# inplace=True modifies titanic directly instead of returning a new frame.
titanic.fillna({'Embarked' : 'Z'},inplace=True)

In [None]:
# Replace missing Embarked values with 'S' and assign the filled column
# back to titanic.
titanic['Embarked'] = titanic['Embarked'].fillna('S')

In [None]:
# Replace missing Embarked values with the label 'No Passenger'.
# The filled column is assigned back to titanic['Embarked'] so that titanic
# stays the full DataFrame; assigning the result to `titanic` itself would
# replace the whole frame with a single Series.
titanic['Embarked'] = titanic['Embarked'].fillna(value='No Passenger')

In [None]:
# Start over from a fresh, independent deep copy of the original data and
# preview it. `deep` is a boolean flag; the string 'deep' only worked
# because any non-empty string is truthy.
titanic = df.copy(deep=True)
titanic.head()

# Follow this LinkedIn group for Machine Learning content: https://www.linkedin.com/groups/7436898/