<a href="https://colab.research.google.com/github/Mohit-Jangid/Pandas/blob/main/Pandas_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'tipping:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F137031%2F324803%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240701%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240701T184032Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db2b93d63641ba8ff912c93ef56531e9a910d8fbfecf88fb3fe3d9f9d87a1248eb71da2d1b76f0d5695aeb44cf9ebd93486dae1126b7a2caa66b6ce7411fc971caba9c3af0e554034d71ce96c7793fa0ab631ada997f9f44d1df4e74306277f8268241eeeb3446051e7cf5c3ad8a11c830ae5416a88f7e188a6a545324ea6651db85ba4c57bd6f73bc878b80cc677fca0d41b84c5311f9b5b881f90da700538222680701ac21c06b1ef29f88218638b80955633f7668ada7a3386ed40d5b4f59e74b29f685e6b6a65d0322bd9f2a46e6c588de9b296f74f26a4614932ea6d86c4c21f2d3901fcbd8cdfda6a8d824a9180d11c0f3b159b2daf7a084ba20afef9ab'

# Root directory that downloaded datasets are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape: unmount /kaggle/input if it is mounted; error output is discarded.
!umount /kaggle/input/ 2> /dev/null
# Remove any previous input directory, then make sure both Kaggle directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; an already-existing
# entry at the link path is left as it is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download each "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    # Only the URL path (query string excluded) decides zip vs. tar handling.
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length is optional; without a usable value the progress
            # bar stays empty instead of int(None) aborting the whole cell.
            try:
                expected_bytes = int(total_length)
            except (TypeError, ValueError):
                expected_bytes = 0
            dl = 0
            # read() returns b'' at end of stream, which stops the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and could otherwise read a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the imported module with a TarFile object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


<div style="color:white;background-color:Black;padding:3%;border-radius:150px 150px;font-size:2.5em;text-align:center">Pandas toolkit Part 5</div>

<center>
<img src="https://i1.wp.com/www.datascienceexamples.com/wp-content/uploads/2019/10/python-and-pandas.jpg?resize=800%2C286&ssl=1" alt="Python and pandas" width="1200">
</center>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The two adjacent string literals are concatenated into a single URL.
url = ("https://raw.github.com/pandas-dev""/pandas/main/pandas/tests/io/data/csv/tips.csv")
# Load the CSV at that URL into a DataFrame and preview the first rows.
tips = pd.read_csv(url)
tips.head()

In [None]:
# Column projection: keep only these four columns, in the listed order.
tips[["total_bill", "tip", "smoker", "time"]]

In [None]:
# Display a copy of tips with an extra tip_rate column (tip / total_bill).
# The result is not assigned back, so tips itself is unchanged.
tips.assign(tip_rate=tips["tip"] / tips["total_bill"])

In [None]:
# Boolean mask: True on rows whose time column equals "Dinner".
is_dinner = tips["time"] == "Dinner"
is_dinner

In [None]:
# Count how often each distinct value occurs in the mask.
is_dinner.value_counts()

In [None]:
# Boolean indexing: keep only the rows where the mask is True.
tips[is_dinner]

In [None]:
# & is the element-wise AND; each comparison is parenthesised because
# & binds more tightly than == and >.
tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]

In [None]:
# | is the element-wise OR: rows with size >= 5 or total_bill > 45.
tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]

In [None]:
# Number of rows in each "sex" group (one value per group).
tips.groupby("sex").size()

In [None]:
# Non-null count of every remaining column within each "sex" group.
tips.groupby("sex").count()

In [None]:
# Same per-group non-null count, restricted to the total_bill column.
tips.groupby("sex")["total_bill"].count()

In [None]:
# Per-day mean tip and row count. The aggregations are named by string:
# passing np.mean / np.size to agg() is deprecated in recent pandas and
# emits a FutureWarning; the string names keep the current behaviour.
tips.groupby("day").agg({"tip": "mean", "day": "size"})

In [None]:
# Row count and mean tip for every (smoker, day) combination. String names
# replace np.size / np.mean, which agg() deprecates in recent pandas
# (FutureWarning); the string names keep the current behaviour.
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})

In [None]:
# Take the 15 rows with the largest tip, then keep the last 2 of those rows.
tips.nlargest(10 + 5, columns="tip").tail(2)

In [None]:
# Top two bills per day, row-number style: number the rows within each day
# in descending total_bill order (cumcount is 0-based, hence + 1), attach
# the numbers to tips by index, then keep numbers 1 and 2.
(
    tips.assign(
        rn=tips.sort_values(["total_bill"], ascending=False)
        .groupby(["day"])
        .cumcount()
        + 1
    )
    .query("rn < 3")
    .sort_values(["day", "rn"])
)

In [None]:
# Same top-two-per-day selection, using rank() within each day instead of
# cumcount; method="first" breaks ties by order of appearance.
(
    tips.assign(
        rnk=tips.groupby(["day"])["total_bill"].rank(
            method="first", ascending=False
        )
    )
    .query("rnk < 3")
    .sort_values(["day", "rnk"])
)

In [None]:
# Among rows with tip < 2, keep those ranked 1 or 2 by tip within their sex.
# The rank is computed over the full tips frame and aligned to the filtered
# rows by index; method="min" gives tied tips the same (lowest) rank.
(
    tips[tips["tip"] < 2]
    .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min"))
    .query("rnk_min < 3")
    .sort_values(["sex", "rnk_min"])
)

In [None]:
# In-place update: double the tip on every row whose tip is below 2.
tips.loc[tips["tip"] < 2, "tip"] *= 2

In [None]:
# Keep only rows with tip <= 9 and rebind tips to the filtered result.
tips = tips.loc[tips["tip"] <= 9]

In [None]:
# Read the Kaggle copy as tab-separated text with no header row.
# Both calls parse the same file; the second assignment overwrites the first.
tips = pd.read_csv("/kaggle/input/tipping/tips.csv", sep="\t", header=None)
# alternatively, read_table is an alias to read_csv with tab delimiter
tips = pd.read_table("/kaggle/input/tipping/tips.csv", header=None)

In [None]:
# Write the frame to an Excel workbook in the current working directory.
# NOTE(review): writing .xlsx presumably needs an Excel engine such as
# openpyxl to be installed; confirm for the target environment.
tips.to_excel("./tips.xlsx")

In [None]:
# Read the workbook back, using its first column as the index.
tips_df = pd.read_excel("./tips.xlsx", index_col=0)

In [None]:
# Show the first five rows of tips.
# NOTE(review): this previews tips, not the tips_df frame read back from Excel; confirm intent.
tips.head(5)

In [None]:
# Read the Kaggle copy as tab-separated text with no header row.
# Both calls parse the same file; the second assignment overwrites the first.
tips = pd.read_csv("/kaggle/input/tipping/tips.csv", sep="\t", header=None)
# alternatively, read_table is an alias to read_csv with tab delimiter
tips = pd.read_table("/kaggle/input/tipping/tips.csv", header=None)
tips.head()

In [None]:
# The two adjacent string literals are concatenated into a single URL.
url = ("https://raw.github.com/pandas-dev""/pandas/main/pandas/tests/io/data/csv/tips.csv")

# Reload tips from the URL, replacing whatever tips currently refers to.
tips = pd.read_csv(url)
tips.head()

In [None]:
# Overwrite total_bill with itself minus 2.
tips["total_bill"] = tips["total_bill"] - 2
# new_bill is half of the already-reduced total_bill.
tips["new_bill"] = tips["total_bill"] / 2
tips.head()

In [None]:
# Remove the new_bill column (axis=1 selects columns) and rebind tips.
tips = tips.drop("new_bill", axis=1)

In [None]:
# Rows whose total_bill is greater than 10.
tips[tips["total_bill"] > 10]

In [None]:
# Element-wise if/else: "low" where total_bill < 10, otherwise "high".
tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
tips

In [None]:
# Assigning a scalar Timestamp fills every row of the column with that date.
tips["date1"] = pd.Timestamp("2013-01-15")
tips["date2"] = pd.Timestamp("2015-02-15")
# Pull single date components out through the .dt accessor.
tips["date1_year"] = tips["date1"].dt.year
tips["date2_month"] = tips["date2"].dt.month
# Adding a MonthBegin offset moves each date forward to the start of the following month.
tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
# Difference between the two dates after converting both to monthly periods.
tips["months_between"] = tips["date2"].dt.to_period("M") - tips["date1"].dt.to_period("M")

# Show only the date-related columns.
tips[["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]]

In [None]:
# Keep only the sex, total_bill and tip columns.
tips[["sex", "total_bill", "tip"]]

In [None]:
# Display tips without the sex column; not assigned back, so tips keeps the column.
tips.drop("sex", axis=1)

In [None]:
# Display tips with total_bill renamed to total_bill_2; tips itself is not changed.
tips.rename(columns={"total_bill": "total_bill_2"})

In [None]:
# Sort by sex, then by total_bill within each sex (ascending), and rebind tips.
tips = tips.sort_values(["sex", "total_bill"])
tips.head(2)

In [None]:
# Character length of each value in the time column.
tips["time"].str.len()

In [None]:
# Length of each time value after stripping trailing whitespace.
tips["time"].str.rstrip().str.len()

In [None]:
# Index of the first occurrence of "ale" in each sex value (-1 when absent).
tips["sex"].str.find("ale")

In [None]:
# Slice each string to its first character.
tips["sex"].str[0:1]

In [None]:
# Pivot: one row per party size, one column per sex, cells aggregated from tip
# with np.average.
# NOTE(review): aggfunc="mean" is the more common spelling; the two presumably
# differ only in how missing values are treated; confirm before switching.
pd.pivot_table(tips, values="tip", index=["size"], columns=["sex"], aggfunc=np.average)

In [None]:
# Positional slice: the row at position 1 and the columns at positions 0-2.
tips.iloc[1:2,0:3]

In [None]:
# Element-wise comparison of every cell with the string "3.75"; yields a boolean frame.
# NOTE(review): this compares against a string, not the number 3.75; confirm that is intended.
tips == "3.75"

In [None]:
# Boolean Series: True where the day value contains "S".
tips["day"].str.contains("S")

In [None]:
# Display a copy in which cells equal to "Thu" become "Thursday"; tips is not modified.
tips.replace("Thu", "Thursday")

In [None]:
# Sum total_bill and tip for every (sex, smoker) combination.
tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum()
tips_summed

In [None]:
# Group total_bill by smoker status.
gb = tips.groupby("smoker")["total_bill"]
# transform("mean") returns the group mean aligned to every original row, so
# the subtraction yields each bill's deviation from its own group's mean.
tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean")
tips.head(2)

In [None]:
# First entry of each remaining column within every (sex, smoker) group.
tips.groupby(["sex", "smoker"]).first()

In [None]:
# The two adjacent string literals are concatenated into a single URL.
url = ("https://raw.github.com/pandas-dev""/pandas/main/pandas/tests/io/data/csv/tips.csv")

# Reload tips from the URL, discarding the columns added in earlier cells.
tips = pd.read_csv(url)
tips.head(2)

In [None]:

# Rows whose total_bill is greater than 10.
tips[tips["total_bill"] > 10]

In [None]:
# Keep only the sex, total_bill and tip columns.
tips[["sex", "total_bill", "tip"]]

In [None]:
# Sort by sex, then by total_bill within each sex (ascending), and rebind tips.
tips = tips.sort_values(["sex", "total_bill"])
tips

In [None]:
# Print the last 20 rows and at most the first 12 columns as plain text;
# to_string() renders the whole selection instead of the truncated notebook view.
print(tips.iloc[-20:, :12].to_string())