<a href="https://colab.research.google.com/github/Mohit-Jangid/Pandas/blob/main/Pandas_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240629%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240629T081602Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7efcebc9efa7bfa3e47c6aa61a58a01861a66f79917d9ff0d699b5be1f0bdcd811fd0c237bb7f2142a5251ffe9a626760bd2325e350979d6b89ca65bd56e7dbfbf75bff6c7460cd7628803bcf1fd571bf469e321d7573f8e060baf7ad92bb4aaaeb489fa84ce747447c96ed6e8c3b16bf70b2bf8776a4e3c3d9ebaa2062fe16a25f411d04f756b52c27a8b44402df5bf853625c1cc65ec2257d6baa0d720c587ddf027e0ac5cb5d5f243fdba4a14c28038f0949248aef3e155e123953403979dec0250a9e73874e2278ffe7669dfbf8a69ad68341011326b2f9b00865471c4e43cfdeed12304a2123daf83bcb03a9eefadac6d3c6d056b0e660dca5c6b5b5898'

# Local stand-ins for the input and working directories used by the rest of this cell.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code — confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape: unmount anything mounted at the input path, discarding error output.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory; ignore_errors covers the case where it does not exist yet.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working; an already-existing link is left in place.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into KAGGLE_INPUT_PATH/<directory> and unpack it (zip or tar archive).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split once: only the first colon separates the directory from the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; the progress bar is
            # only drawn when a positive total size is known.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the opened archive; binding it to
                # `tarfile` would replace the module and break the next
                # non-zip download in this loop.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


<div style="color:white;background-color:Black;padding:3%;border-radius:150px 150px;font-size:2.5em;text-align:center">Basic Pandas Toolkit, Part 1</div>

<center>
<img src="https://i1.wp.com/www.datascienceexamples.com/wp-content/uploads/2019/10/python-and-pandas.jpg?resize=800%2C286&ssl=1" width=1200>
</center>


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Three-passenger demo table; each keyword below becomes one column.
df = pd.DataFrame(
    data=dict(
        Name=["Braund, Mr. Owen Harris", "Allen, Mr. William Henry", "Bonnell, Miss. Elizabeth"],
        Age=[22, 35, 58],
        Sex=["male", "male", "female"],
    )
)

# Leaving the frame as the last expression makes the notebook render it.
df

In [None]:
# Select a single column by its label.
df["Age"]

In [None]:
# A standalone Series holding the same three values, named "Age".
ages = pd.Series([22, 35, 58], name="Age")
ages

In [None]:
# Largest value, first through the DataFrame column ...
df["Age"].max()

In [None]:
# ... and then through the standalone Series.
ages.max()

In [None]:
# Summary statistics for the frame.
df.describe()

In [None]:
# Load the training CSV from the input directory and preview the first rows.
titanic = pd.read_csv("/kaggle/input/titanic/train.csv")
titanic.head()

In [None]:
# Data type of every column.
titanic.dtypes

In [None]:
# Write the frame to an Excel sheet named "passengers", leaving out the row index.
titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)

In [None]:
# Read the same sheet back; from here on `titanic` is the Excel round-tripped frame.
titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers")

In [None]:
titanic.info()

In [None]:
# Single-column selection; note this rebinds `ages` to the Titanic "Age" column.
ages = titanic["Age"]
ages.head()

In [None]:
# Inspect the type and shape of a single-column selection.
type(titanic["Age"])

In [None]:
titanic["Age"].shape

In [None]:
# NOTE(review): this cell repeats the previous one verbatim.
titanic["Age"].shape

In [None]:
# Selecting with a list of labels (double brackets) picks several columns at once.
age_sex = titanic[["Age", "Sex"]]
age_sex.head()

In [None]:
titanic[["Age", "Sex"]].shape

In [None]:
# Boolean mask: keep rows whose Age is greater than 35.
above_35 = titanic[titanic["Age"] > 35]
above_35.head()

In [None]:
# Keep rows whose Pclass is 2 or 3, using isin ...
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23.head()

In [None]:
# ... and the same condition spelled out as two comparisons joined with `|`.
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
class_23.head()

In [None]:
# Keep rows where Age is not missing.
age_no_na = titanic[titanic["Age"].notna()]
age_no_na.head()

In [None]:
# .loc with only a row mask keeps every column ...
adult_names = titanic.loc[titanic["Age"] > 35]
adult_names.head()

In [None]:
# ... while adding a column label narrows the result to "Name".
adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names.head()

In [None]:
# Positional selection: rows 9-24 and columns 2-4 (end positions are excluded).
titanic.iloc[9:25, 2:5]

In [None]:
# Overwrite the first three rows of the column at position 3 in place.
titanic.iloc[0:3, 3] = "anonymous"
# Bind the modified cells themselves; the former chained assignment
# bound only the plain string "anonymous" to `anon`.
anon = titanic.iloc[0:3, 3]
anon

In [None]:
# Preview the frame again after the in-place assignment in the previous cell.
titanic.head()

In [None]:
# Mean of a single column.
titanic["Age"].mean()

In [None]:
# Median of two columns at once.
titanic[["Age", "Fare"]].median()

In [None]:
titanic[["Age", "Fare"]].describe()

In [None]:
# Per-column choice of aggregations: each key names a column, each list the statistics to compute.
titanic.agg({

"Age": ["min", "max", "median", "skew"],
"Fare": ["min", "max", "median", "mean"]

})

In [None]:
# Group by Sex and aggregate Age with mean, max and first.
titanic[["Sex", "Age"]].groupby("Sex").mean()

In [None]:
titanic[["Sex", "Age"]].groupby("Sex").max()

In [None]:
titanic[["Sex", "Age"]].groupby("Sex").first()

In [None]:
titanic.head(2)

In [None]:
# Group the whole frame first, then pick the column to aggregate.
titanic.groupby("Sex")["Age"].mean()

In [None]:
# Grouping by two keys.
titanic.groupby(["Sex", "Pclass"])["Fare"].mean()

In [None]:
# Rows per Pclass, via value_counts and via groupby + count.
titanic["Pclass"].value_counts()

In [None]:
titanic.groupby("Pclass")["Pclass"].count()

In [None]:
# Sort by Age in descending order.
titanic.sort_values(by="Age",ascending=False).head()

In [None]:
# Sort by Pclass, then Age, both descending.
titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head()

In [None]:
titanic.dtypes

In [None]:
# Element-wise string methods are reached through the .str accessor.
titanic["Name"].str.lower()

In [None]:
# Split every name on the comma; each element becomes a list of parts.
titanic["Name"].str.split(",")

In [None]:
# Part before the comma.
titanic["Surname"] = titanic["Name"].str.split(",").str.get(0)
titanic["Surname"]

In [None]:
# Part after the comma.
titanic["Name_main"] = titanic["Name"].str.split(",").str.get(1)
titanic["Name_main"]

In [None]:
titanic["Name"].str.split(",")

In [None]:
# NOTE(review): Real_Name receives the same part (index 0) that was stored in "Surname" above.
titanic['Real_Name'] = titanic["Name"].str.split(",").str.get(0)
titanic.head()

In [None]:
# NOTE(review): this overwrites the "Surname" column created earlier with the part
# after the comma (index 1) — confirm the column naming is intended.
titanic['Surname'] = titanic["Name"].str.split(",").str.get(1)
titanic.head()

In [None]:
# Derive the title (e.g. "Mr") straight from "Name": take the text after the
# comma, keep what precedes the first period, and strip the space that follows
# the comma so the values compare cleanly (previously they kept a leading space
# and relied on the overwritten "Surname" column).
titanic['Salutation'] = (
    titanic["Name"].str.split(",").str.get(1).str.split(".").str.get(0).str.strip()
)
titanic.head()

In [None]:
# Substring test per row; "Mr" is matched anywhere in the name.
titanic["Name"].str.contains("Mr")

In [None]:
# Use the substring test as a row filter.
titanic[titanic["Name"].str.contains("Countess")]

In [None]:
# Length of every name.
titanic["Name"].str.len()

In [None]:
# Index label of the longest name.
titanic["Name"].str.len().idxmax()

In [None]:
# Look up the longest and the shortest name by that index label.
titanic.loc[titanic["Name"].str.len().idxmax(), "Name"]

In [None]:
titanic.loc[titanic["Name"].str.len().idxmin(), "Name"]

In [None]:
# Map whole values through a dictionary.
titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"})
titanic["Sex_short"]

In [None]:
# Same result with substring replacement. Order matters here: "female" must be
# replaced first because "male" is a substring of it.
titanic["Sex_short"] = titanic["Sex"].str.replace("female", "F")
titanic["Sex_short"] = titanic["Sex_short"].str.replace("male", "M")

In [None]:
import numpy as np
# 10x3 frame of standard-normal random values with columns a, b, c.
df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc"))
# Columns are returned in the order listed, not in the frame's order.
df[["a", "c","b"]]

In [None]:
# Label-based equivalent: all rows, columns a and c.
df.loc[:, ["a", "c"]]

<font color="Red" size=+2 face="Comic Sans MS">Good Code</font>

In [None]:
# Column labels: seven letters followed by the integers 7..29 (30 labels in total).
n = 30
named = list("abcdefg")
columns = [*named, *range(len(named), n)]
df = pd.DataFrame(data=np.random.randn(n, n), columns=columns)
# np.r_ concatenates the two positional slices, i.e. columns 0-9 and 24-29.
df.iloc[:, np.r_[:10, 24:30]]

In [None]:
# Two numeric columns plus two mixed-type grouping keys, each containing missing values.
df = pd.DataFrame(
    dict(
        v1=[1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
        v2=[11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
        by1=["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
        by2=["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, np.nan],
    )
)

df

In [None]:
# Group on both key columns and average the two value columns.
g = df.groupby(["by1", "by2"])
g[["v1", "v2"]].mean()

In [None]:
import numpy as np
# The values 0..4 stored as float32.
s = pd.Series(np.arange(5), dtype=np.float32)
s
 
In [None]:
# Element-wise membership test against the listed values.
s.isin([2, 4])

<font color="Red" size=+2 face="Comic Sans MS">Data generation code</font>

In [None]:
# Data generation code

import random
import string

# 25 rows: five team labels repeated five times, 25 distinct lowercase letters
# as player names, and a uniformly drawn batting average between 0.200 and 0.400.
baseball = pd.DataFrame(
    data={
        "team": [f"team {number}" for number in range(1, 6)] * 5,
        "player": random.sample(list(string.ascii_lowercase), 25),
        "batting avg": np.random.uniform(0.200, 0.400, size=25),
    }
)

baseball

In [None]:
# Highest batting average per team. The aggregation is named by string because
# passing the NumPy callable (np.max) is deprecated in recent pandas releases.
baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")

In [None]:
# Two columns of ten standard-normal random values each.
df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)})
df.head()

In [None]:
# Row filter written as a query string ...
df.query("a <= b")

In [None]:
# ... as a boolean mask ...
df[df["a"] <= df["b"]]

In [None]:
# ... and as a boolean mask passed to .loc.
df.loc[df["a"] <= df["b"]]

In [None]:
# The opposite comparison.
df[df["a"] >= df["b"]]

In [None]:
# Fresh random frame for the expression examples below.
df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)})
df.head()

In [None]:
# Column arithmetic written as an expression string ...
df.eval("a + b")

In [None]:
# ... and as ordinary Series addition.
df["a"] + df["b"]

In [None]:
# 120 rows: three uniformly drawn measurement columns, a month label cycling
# through 5-8, and a random week number drawn from 1..3.
df = pd.DataFrame(
    data={
        "x": np.random.uniform(1.0, 168.0, size=120),
        "y": np.random.uniform(7.0, 334.0, size=120),
        "z": np.random.uniform(1.7, 20.7, size=120),
        "month": [5, 6, 7, 8] * 30,
        "week": np.random.randint(1, 4, size=120),
    }
)

df.head()

In [None]:
# Mean and standard deviation of x for every (month, week) pair. The
# aggregations are named by string because passing NumPy callables
# (np.mean, np.std) to agg is deprecated in recent pandas releases.
grouped = df.groupby(["month", "week"])
grouped["x"].agg(["mean", "std"])

In [None]:
# The values 1..23 followed by one NaN, arranged as a 2x3x4 array.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
a

In [None]:
# One row per array element: the element's index along each axis, then its value.
pd.DataFrame([(*position, value) for position, value in np.ndenumerate(a)])

In [None]:
# (position, value) pairs for 1..4 followed by one NaN.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
a = list(enumerate(list(range(1, 5)) + [np.nan]))
a

In [None]:
# Build a frame from the list of (position, value) pairs held in `a`.
pd.DataFrame(a)

In [None]:
# Wide-format table: two identifier columns and two measurement columns.
cheese = pd.DataFrame(
    data=dict(
        first=["John", "Mary"],
        last=["Doe", "Bo"],
        height=[5.5, 6.0],
        weight=[130, 150],
    )
)

cheese

In [None]:
# Reshape to long format, keeping "first" and "last" as identifier columns.
pd.melt(cheese, id_vars=["first", "last"])

In [None]:
cheese.set_index(["first", "last"]).stack() # alternative

In [None]:
# Twelve rows of uniformly drawn measurements with cycling month and week labels.
df = pd.DataFrame({
    "x": np.random.uniform(1.0, 168.0, 12),
    "y": np.random.uniform(7.0, 334.0, 12),
    "z": np.random.uniform(1.7, 20.7, 12),
    "month": [5, 6, 7] * 4,
    "week": [1, 2] * 6,
})

# Long format: one row per (month, week, variable) observation.
mdf = pd.melt(df, id_vars=["month", "week"])

# Average value per variable and week, one column per month. The aggregation is
# named by string because passing the NumPy callable (np.mean) is deprecated in
# recent pandas releases.
pd.pivot_table(mdf, values="value", index=["variable", "week"], columns=["month"], aggfunc="mean")

In [None]:
# Feed log: which animal received how much of which feed type.
df = pd.DataFrame(
    data=dict(
        Animal=["Animal1", "Animal2", "Animal3", "Animal2", "Animal1", "Animal2", "Animal3"],
        FeedType=["A", "B", "A", "A", "B", "B", "A"],
        Amount=[10, 7, 4, 2, 5, 6, 2],
    )
)

# Total amount per animal (rows) and feed type (columns).
df.pivot_table(index="Animal", columns="FeedType", values="Amount", aggfunc="sum")

In [None]:
# The same totals as a grouped sum over both keys.
df.groupby(["Animal", "FeedType"])["Amount"].sum()

In [None]:
# Bin the values into 3 intervals.
pd.cut(pd.Series([1, 2, 3, 4, 5, 6]), 3)

In [None]:
# Convert the values to the categorical dtype.
pd.Series([1, 2, 3, 2, 2, 3]).astype("category")

In [None]:
# Two string columns with one missing value each.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
frame = pd.DataFrame({"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]})

frame

In [None]:
# Rows where col2 is missing.
frame[frame["col2"].isna()]

In [None]:
# Rows where col1 is present.
frame[frame["col1"].notna()]

In [None]:
# Two frames sharing a "key" column; df2 contains the key "D" twice.
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})

In [None]:
# Merge on the shared column using the default join type.
pd.merge(df1, df2, on="key")

In [None]:
# Merge df1's "key" column against df2's index instead of a column.
indexed_df2 = df2.set_index("key")
pd.merge(df1, indexed_df2, left_on="key", right_index=True)

In [None]:
# The join type is chosen with `how`: left, right, then outer.
pd.merge(df1, df2, on="key", how="left")

In [None]:
pd.merge(df1, df2, on="key", how="right")

In [None]:
pd.merge(df1, df2, on="key", how="outer")

In [None]:
# Two city rankings; the Chicago row (rank 1) appears in both.
df1 = pd.DataFrame({"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)})

df2 = pd.DataFrame({"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]})


# Stack the rows of both frames.
pd.concat([df1, df2])

In [None]:
# Stack, then drop rows that are exact duplicates.
pd.concat([df1, df2]).drop_duplicates()

In [None]:
df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
df

In [None]:
# Split each full name on the space; expand=True spreads the parts over columns,
# so [0] picks the first part and [1] the second.
firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]})
firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0]
firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[1]
firstlast

In [None]:
# Case conversions through the .str accessor.
firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]})
firstlast["upper"] = firstlast["string"].str.upper()
firstlast["lower"] = firstlast["string"].str.lower()
firstlast["title"] = firstlast["string"].str.title()
firstlast

In [None]:
# Rebuild the two keyed frames for the DataFrame.merge examples below.
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df1

In [None]:
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})
df2

In [None]:
# The four join types, selected with `how`.
inner_join = df1.merge(df2, on=["key"], how="inner")
inner_join

In [None]:
left_join = df1.merge(df2, on=["key"], how="left")
left_join

In [None]:
right_join = df1.merge(df2, on=["key"], how="right")
right_join

In [None]:
outer_join = df1.merge(df2, on=["key"], how="outer")
outer_join

In [None]:
# Eight rows: AAA is all ones, BBB counts 0..7.
df = pd.DataFrame({"AAA": [1] * 8, "BBB": list(range(0, 8))})
df

In [None]:
# A plain Python list [1, 2, 3, 4] (not a pandas Series, despite the name).
series = list(range(1, 5))
series

In [None]:
# Assign the four values to AAA for index labels 2 through 5 (label slices include both ends).
df.loc[2:5, "AAA"] = series
df

In [None]:
# Class roster; the first and third rows are identical.
df = pd.DataFrame({

"class": ["A", "A", "A", "B", "C", "D"],
"student_count": [42, 35, 42, 50, 47, 45],
"all_pass": ["Yes", "Yes", "Yes", "No", "No", "Yes"]

})

# Drop rows that are duplicates across all columns.
df.drop_duplicates()

In [None]:
# Drop rows that are duplicates when only these two columns are compared.
df.drop_duplicates(["class", "student_count"])

In [None]:
# Append one row by concatenating a single-row frame.
# NOTE(review): all_pass holds the strings "Yes"/"No" in df, but this row supplies
# the boolean True — confirm whether "Yes" was intended.
new_row = pd.DataFrame([["E", 51, True]],columns=["class", "student_count", "all_pass"])
pd.concat([df, new_row])

In [None]:
df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
df

In [None]:
# Rebuild the two keyed frames and the four joins; the outer join is reused
# by the missing-value cells further down.
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df1

In [None]:
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})
df2

In [None]:
inner_join = df1.merge(df2, on=["key"], how="inner")
inner_join

In [None]:
left_join = df1.merge(df2, on=["key"], how="left")
left_join

In [None]:
right_join = df1.merge(df2, on=["key"], how="right")
right_join

In [None]:
outer_join = df1.merge(df2, on=["key"], how="outer")
outer_join

In [None]:
# Element-wise sum of the two value columns produced by the merge.
outer_join["value_x"] + outer_join["value_y"]

In [None]:
# Column total of value_x.
outer_join["value_x"].sum()

In [None]:
# Rows where value_x is missing, then rows where it is present.
outer_join[outer_join["value_x"].isna()]

In [None]:
outer_join[outer_join["value_x"].notna()]

In [None]:
# Drop every row that contains a missing value.
outer_join.dropna()

In [None]:
# Forward-fill: replace each missing value with the last valid value above it.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated in
# recent pandas releases.
outer_join.ffill()

In [None]:
# Replace missing value_x entries with the column mean.
outer_join["value_x"].fillna(outer_join["value_x"].mean())

In [None]:
# A Series containing one missing value.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

In [None]:
# Six consecutive dates starting at 2013-01-01.
dates = pd.date_range("20130101", periods=6)
dates

In [None]:
# 6x4 frame of standard-normal random values, indexed by those dates.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

In [None]:
# Frame built from differently typed inputs: a float scalar, a timestamp, a
# float32 Series, an int32 array, a categorical and a string scalar.
df2 = pd.DataFrame(
    data=dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2

In [None]:
df2.index

In [None]:
# Convert each frame to a NumPy array.
df.to_numpy()

In [None]:
df2.to_numpy()

In [None]:
# Sort the columns by label, descending.
df.sort_index(axis=1, ascending=False)

In [None]:
# Sort the rows by the values in column B.
df.sort_values(by="B")

In [None]:
# Slicing rows by position ...
df[0:3]

In [None]:
# ... and by date label.
df["20130102":"20130104"]

In [None]:
# Label-based selection: a single row by its date.
df.loc[dates[0]]

In [None]:
# All rows, two columns.
df.loc[:, ["A", "B"]]

In [None]:
# A date range of rows, two columns.
df.loc["20130102":"20130104", ["A", "B"]]

In [None]:
# A single date, two columns.
df.loc["20130102", ["A", "B"]]

In [None]:
# Single-cell access by label.
df.at[dates[0], "A"]

In [None]:
# Position-based selection: the row at position 3.
df.iloc[3]

In [None]:
# Row and column position slices (end positions are excluded).
df.iloc[3:5, 0:2]

In [None]:
# Explicit lists of row and column positions.
df.iloc[[1, 2, 4], [0, 2]]

In [None]:
df.iloc[1:3, :]

In [None]:
df.iloc[:, 1:3]

In [None]:
# Single cell by position.
df.iloc[1, 1]

In [None]:
# Rows where column A is positive.
df[df["A"] > 0]

In [None]:
# Element-wise mask over the whole frame.
df[df > 0]

In [None]:
# Work on a copy so that adding column E leaves df untouched.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

In [None]:
# Rows whose E value is one of the listed strings.
df2[df2["E"].isin(["two", "four"])]

In [None]:

# Series indexed by six dates starting at 2013-01-02.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1

In [None]:
# Set a single cell by label.
df.at[dates[0], "A"] = 0

In [None]:
# Set a single cell by position.
df.iat[0, 1] = 0

In [None]:
# Overwrite the whole of column D with an array of fives.
df.loc[:, "D"] = np.array([5] * len(df))
df

In [None]:
# On a copy, replace every positive entry with its negation.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

In [None]:
# Reindex to the first four dates and add a new column E; E is then set to 1
# for the first two dates only.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
df1

In [None]:
# Drop rows containing any missing value.
df1.dropna(how="any")

In [None]:
# Fill missing values with a constant.
df1.fillna(value=5)

In [None]:
# Boolean mask of the missing entries.
pd.isna(df1)

In [None]:
# Mean of each column.
df.mean()

In [None]:
# Mean of each row (the positional argument 1 is the axis).
df.mean(1)

In [None]:
# Values aligned to `dates`, then shifted down by two positions.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

In [None]:
# Subtract s from every column, aligning on the row index.
df.sub(s, axis="index")

In [None]:
# Apply a function to each column: cumulative sum ...
df.apply(np.cumsum)

In [None]:
# ... and the range (max minus min).
df.apply(lambda x: x.max() - x.min())