In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

# Import

### CSV

In [None]:
# Load the microbiome table with a two-level (Patient, Taxon) row index.
# The delimiter keyword is `sep`; `step` is not a read_csv parameter and raises TypeError.
mb = pd.read_csv("Data/microbiome.csv", sep=",", index_col=['Patient','Taxon'])

# Skip the listed (0-indexed) lines of the file while reading.
pd.read_csv("Data/microbiome.csv", skiprows=[3,4,6]).head()
# Read only the first 4 data rows.
pd.read_csv("Data/microbiome.csv", nrows=4)
# Treat '?' and -99999 as missing values (NaN).
pd.read_csv("Data/microbiome_missing.csv", na_values=['?', -99999]).head(10)

### Pickle

In [None]:
# Load a previously pickled object from the file "baseball_pickle".
# NOTE(review): unpickling can execute arbitrary code - only load pickle files you trust.
pd.read_pickle("baseball_pickle")

### Excel

In [None]:
# Read the sheet named 'Sheet 1'; header=None means no row is used as column names,
# so the columns get integer labels.
mb = pd.read_excel('Data/microbiome_MID2.xls', sheet_name='Sheet 1', header=None)
mb.head()

# Data to files

In [None]:
# CSV: write the `mb` DataFrame to "mb.csv".
mb.to_csv("mb.csv")

# Binary: serialize `baseball` with pickle.
# NOTE(review): `baseball` is first assigned in a later cell of this notebook, so on a
# fresh kernel that cell has to run before this line.
baseball.to_pickle("baseball_pickle")

# Index
Pass `index_col` to `read_csv` to use one of the columns as the index.

In [None]:
# Load the baseball table, using the 'id' column as the row index.
baseball = pd.read_csv("Data/baseball.csv", index_col='id')
# Quick looks at the data; in a notebook only the last expression of a cell is displayed.
baseball.head()      # first 5 rows
baseball.describe()  # summary statistics of the numeric columns
baseball.sample(10)  # 10 random rows

In [None]:
# True when no index label appears more than once.
baseball.index.is_unique

### Look by index
Index labels are ordered, so you can select ranges of them with a label slice.

In [None]:
# NOTE(review): `baseball_newind` is created in a later cell ("Create your own");
# on a fresh kernel that cell has to run first.
# Select a single row by its index label.
baseball_newind.loc['wickmbo012007']
# Select a range of rows by label slice (both end labels are included).
baseball_newind['womacto01CHN2006':'gonzalu01ARI2006']

# Assign a value to all the columns of the rows within this range
baseball_newind['womacto01CHN2006':'gonzalu01ARI2006'] = 5


### Index isin

In [None]:
# NOTE(review): the name says "lowest", but sorting with ascending=False and taking
# head(3) selects the 3 HIGHEST population densities - confirm which one is intended.
lowest_3_population = COUNTRIES["population_density"].sort_values(ascending=False).head(3)

# Keep only the HAPPINESS rows whose index label is among the selected countries,
# then take their happiness score.
HAPPINESS[HAPPINESS.index.isin(lowest_3_population.index)]["happiness_score"]

### Create your own

In [None]:
# Work on a copy so the original `baseball` frame keeps its own index.
baseball_newind = baseball.copy()
# New label for each row: the player id followed by the year (converted to text).
player_id = baseball["player"] + baseball["year"].astype(str)
baseball_newind.index = player_id
baseball_newind.head()

In [None]:
# Check whether the player+year labels identify each row uniquely.
baseball_newind.index.is_unique

You can also index by the three columns at once (a hierarchical index), similar to a composite primary key

In [None]:
# Build a three-level index from the year, team and player columns.
baseball_h = baseball.set_index(['year', 'team', 'player'])
# Look up a row by giving one label per index level.
baseball_h.loc[(2007, 'ATL', 'francju01')]

### Add and remove columns

In [None]:
# Add columns (instead of just a number you can put a vector)
data['year'] = 2013
COUNTRIES["population_density"] = COUNTRIES["population"] / COUNTRIES["area"]

In [None]:
# Remove a column: axis=1 targets columns (axis=0 would target rows).
# `drop` returns a new DataFrame; `data` itself is left unchanged.
data.drop("year", axis=1)

### Apply filters that are not just ==

In [None]:
# WITH LAMBDA
# Keep rows where value > 1000 AND the phylum name ends with "bacteria".
data[data["value"].apply(lambda x: x > 1000) & data["phylum"].apply(lambda x: str.endswith(x, "bacteria"))]


# WITH QUERY
# The condition is written as a string that refers to column names.
baseball_newind.query('ab > 500')

# To refer to a local variable use @
min_ab = 500
baseball_newind.query('ab > @min_ab')

# Using loc you can look for several columns of one row
baseball_newind.loc['gonzalu01ARI2006', ['h','X2b', 'X3b', 'hr']]

### You can use loc or just look for one column

In [None]:
# Home runs of the 2006 season, relabelled by player id.
is_2006 = baseball.year == 2006
hr2006 = baseball.loc[is_2006, 'hr']
hr2006.index = baseball.player[is_2006]

# Same selection for the 2007 season.
is_2007 = baseball.year == 2007
hr2007 = baseball.loc[is_2007, 'hr']
hr2007.index = baseball.player[is_2007]

### By position
Use `iloc` (integer location) to select rows and columns by position

In [None]:
# The first five rows and the columns 5,6,7
baseball_newind.iloc[:5, 5:8]

# Sorting

In [None]:
# Sort the rows by their index labels.
baseball_newind.sort_index().head()
# Sort the rows by the values of a column ("col_name" is a placeholder).
# This needs `sort_values`: `sort_index` sorts by index labels, and its first
# argument is the axis, not a column name.
baseball_newind.sort_values("col_name", ascending=False).head()

# Sorting values from HR column (largest first, top 10)
baseball.hr.sort_values(ascending=False).head(10)

### Exercise 5

Calculate **on base percentage** for each player, and return the ordered series of estimates.

$$obp = \frac{h + bb + hbp}{ab + bb + hbp + sf}$$

In [None]:
# On-base percentage: obp = (h + bb + hbp) / (ab + bb + hbp + sf).
# Computed column-wise (vectorized) instead of row by row with `apply(axis=1)`.
obp_numerator = baseball['h'] + baseball['bb'] + baseball['hbp']
obp_denominator = baseball['ab'] + baseball['bb'] + baseball['hbp'] + baseball['sf']

# Rows whose denominator is 0 get 0.0 instead of the result of a division by zero.
baseball['obp'] = np.where(obp_denominator != 0, obp_numerator / obp_denominator, 0.0)

# The exercise asks for the ordered series of estimates: highest first.
baseball['obp'].sort_values(ascending=False)

# Group by

### Basic

In [None]:
# First 5 rows of every world region.
country_features.groupby("world_region").head()
# `groupby` has no `ascending` keyword (passing it raises TypeError); the ordering
# of the group keys is controlled by `sort`, which defaults to True.
country_features.groupby("world_region", sort=True).head()

### With more than one columns

convert Series to DataFrame: `TWO OPTIONS`

In [None]:
# Happiness scores grouped by world region.
scores_by_region = country_features.groupby("world_region")["happiness_score"]

# Option 1: the mean is a Series -> turn it into a one-column DataFrame with to_frame(),
# then attach the number of countries per region as a second column.
country_region = scores_by_region.mean().to_frame() # <--
country_region["n_countries"] = scores_by_region.count()

# Happiest regions first.
country_region = country_region.sort_values(by="happiness_score", ascending=False)

In [None]:
# Option 2: compute several aggregates at once; the result is a DataFrame with
# one column per aggregate ('mean' and 'size').
average_by_region = country_features.groupby("world_region")['happiness_score'].agg(['mean','size'])
# Display the regions ordered by mean happiness, highest first (returns a new, sorted frame).
average_by_region.sort_values("mean", ascending=False)

# Merge

### Merge two tables

In [None]:
# With no key given, merge joins on the columns the two frames have in common.
pd.merge(df1, df2)

by default, `merge` performs an **inner join** on the tables, meaning that the merged table represents an intersection of the two tables.

In [None]:
# Outer join: keep the rows of both tables, filling the gaps with NaN.
pd.merge(df1, df2, how='outer')

### Merge by a column (mmsi)
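`left_index=True` uses the index of the left table as its join key (here matched against the `mmsi` column of the right table).
By default only rows with a match in both tables are kept; pass `how='outer'` to also display rows that don't have a corresponding row in the other table.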

In [None]:
# Join the index of `vessels` (left) against the 'mmsi' column of `segments` (right).
segments_merged = pd.merge(vessels, segments, left_index=True, right_on='mmsi')
# Join on a column of the left table instead ("my_column" is a placeholder).
# The keyword is `left_on` (there is no `left_of`), and it cannot be combined with
# `left_index=True`: the left key is either the index or a column, not both.
segments_merged = pd.merge(vessels, segments, left_on="my_column", right_on='mmsi')

# Drop the duplicated columns just in case (returns a new DataFrame).
country_features.drop("country_name", axis=1)


# Concatenation

A common data manipulation is appending rows or columns to a dataset that already conform to the dimensions of the existing rows or columns, respectively:

In [None]:
# Join two random 5-element arrays end to end into one 10-element array.
np.concatenate([np.random.random(5), np.random.random(5)])

In [None]:
# axis=0 stacks the rows of mb2 below the rows of mb1.
pd.concat([mb1, mb2], axis=0).head()

However, the index is no longer unique, due to overlap between the two `DataFrames`.

In [None]:
# Check whether the concatenated index still has unique labels.
pd.concat([mb1, mb2], axis=0).index.is_unique

# Apply

### Inline

In [None]:
# Print one formatted line per country with 100% literacy.
# The subscripts inside the f-string use single quotes: reusing the enclosing
# double quotes is a SyntaxError before Python 3.12.
country_features[country_features["literacy"] == 100].apply(
    lambda x: print(f"{x['world_region']:<22} - {x['country']:<10} ({x['happiness_score']:.2f})"),
    axis=1  # important: axis=1 passes one ROW at a time to the lambda
)

### Iterrows

In [None]:
# Same report as a plain loop: iterrows() yields (index label, row) pairs.
fully_literate = country_features[country_features.literacy == 100]
for idx, row in fully_literate.iterrows():
    print(f"{row.world_region} - {row.country} ({row.happiness_score})")

# NAN values

In [None]:
# Select through a boolean mask that is True where the value is not missing.
foo[foo.notnull()]
# dropna() removes the missing entries and returns a new object.
foo.dropna()

This can be overridden by passing the `how='all'` argument, which only drops a row when every field is a missing value.

In [None]:
# Drop only the rows in which every field is missing.
data.dropna(how='all')

We can do this programmatically in Pandas with the `fillna` method.

In [None]:
# Replace every missing value with 0.
bacteria2.fillna(0)
# A dict gives a separate fill value per column.
data.fillna({'year': 2013, 'treatment':2})

Notice that `fillna` by default returns a new object with the desired filling behavior, rather than changing the `Series` or  `DataFrame` **in place**.

In [None]:
# Fill the missing 'treatment' values in place, directly on `data`.
# Calling fillna(inplace=True) on the selection `data['treatment']` is chained
# assignment: it may modify a temporary object instead of `data`, and it no longer
# updates `data` when pandas Copy-on-Write is enabled.
data.fillna({'treatment': 2}, inplace=True)

### Filling nans with other non nans

In [None]:
# Keep 'runtime' where it is present; where it is NaN, take the value from 'runtime_new'.
MOVIES['runtime'] = MOVIES['runtime'].combine_first(MOVIES['runtime_new'])

# Plotting
### Basic things

In [None]:
# Logarithmic y-axis on the current axes.
plt.yscale('log')


# --- V1: accuracy against max depth, one marker per point ---
plt.plot(depths, acuracy_scores, marker="o")

plt.title('Accuracy vs max depth')
plt.xlabel('Max depth')
plt.ylabel('Accuracy')
# Leave one unit of margin on each side of the depth range.
x_low, x_high = depths[0] - 1, depths[-1] + 1
plt.xlim(x_low, x_high)
plt.ylim(0.9, 1.0)
plt.tight_layout()


# --- V2: loss curve with legend and grid ---
plt.plot(models, losses, marker="o", label="Loss Curve")

plt.title('Loss over epochs')
plt.xlabel('Model after x epochs')
plt.ylabel('Loss')
# Pad the y-range by 0.1 below the smallest and above the largest loss.
y_low, y_high = min(losses) - 0.1, max(losses) + 0.1
plt.ylim(y_low, y_high)
plt.grid(True)
plt.legend()
plt.tight_layout()

### Legend plot: `Label="..."`

In [None]:
# Give each curve a label= ...
plt.plot(t_train, train_acc_history, label="Train")
plt.plot(t_val, val_acc_history, label="Val")
...   
# ... and call legend() to draw the box listing those labels.
plt.legend()
...

### Normal plot


In [None]:
# Scatter plots straight from the DataFrame: x and y are column names.
baseball.plot.scatter(x='hr', y='X2b')
baseball.plot.scatter(x='ab', y='h')

### Histogram and Barplot

In [None]:
# Frequency: histogram of the 'ab' column
baseball['ab'].hist() 

# By another variable: mean IMDb rating per main genre, drawn as bars
movies_by_genre = movies.groupby("Main_Genre")["imdb_rating"].mean()
# NOTE(review): `sns` (seaborn) is not imported in the import cell at the top of this
# notebook - confirm `import seaborn as sns` runs before this cell.
sns.barplot(x=movies_by_genre.index, y=movies_by_genre.values, palette="viridis")

### Stacked barplot

In [None]:
# One stacked bar per genre; each segment is the mean of one actor age-bracket column.
column_names = ['Actors 0-20', 'Actors 20-30', 'Actors 30-40', 'Actors 40-60', 'Actors 60+']

# Collect one row of column means per genre and build the frame once, instead of
# growing an empty DataFrame row by row with `loc[len(df)]`.
genres = list(NEW_GENRE["categories"])
genre_means = []
for genre in genres:
    # Rows whose "new_genres" entry contains this genre.
    in_genre = MOVIES["new_genres"].apply(lambda x: genre in x)
    genre_means.append(MOVIES.loc[in_genre, column_names].mean())

plot_df = pd.DataFrame(genre_means, columns=column_names)
# Label the rows with the genre so the x-axis ticks show genres rather than 0..n-1.
plot_df.index = genres

plot_df.plot(kind='bar', stacked=True, figsize=(14, 8))

# Add the title and the axis labels
plt.title("Répartition des Acteurs par Classe d'Âge pour Chaque Genre de Film", fontsize=16)
plt.xlabel("Genre de Film", fontsize=14)
plt.ylabel("Nombre d'Acteurs", fontsize=14)
plt.legend(title="Tranche d'Âge", fontsize=12)

# Show the figure
plt.tight_layout()
plt.show()


### Scatter plot

In [None]:
# Minimal version: a semi-transparent scatter with black marker edges.
# NOTE(review): this call draws on the current figure, before `plt.figure` below
# creates a new one - presumably the short form of the same plot; confirm the
# duplicate is intended.
plt.scatter(CLEAN['Age'], CLEAN['Price'], alpha=0.6, edgecolors='k')

# Full version: explicit figure size, title, axis labels and rotated x ticks.
plt.figure(figsize=(9, 5))
plt.scatter(CLEAN['Age'], CLEAN['Price'], alpha=0.6, edgecolors='k')
plt.title("Car Age vs Price", fontsize=20)
plt.xlabel("Car Age (years)", fontsize=13)
plt.ylabel("Price (CHF)", fontsize=13)
plt.xticks(rotation=90)  # vertical tick labels

plt.show()

## Multiplot

- plt.subplot(nrows, ncols, current_plot_index)
- plt.figure(figsize=(wide, height))
- plt.tight_layout() -> avoid overlapping

In [None]:
# Genre x studio tables of means: one for worldwide gross, one for IMDb rating.
pivot_options = dict(index='Main_Genre', columns='studio', aggfunc='mean')
heatmap_data_gross = movies.pivot_table(values='worldwide_gross', **pivot_options)
heatmap_data_rating = movies.pivot_table(values='imdb_rating', **pivot_options)

plt.figure(figsize=(12, 6))

# Draw the two heat maps side by side (1 row, 2 columns; panel positions start at 1).
heatmap_panels = [
    (heatmap_data_gross, "Heat map of Genre x Studio by gross"),
    (heatmap_data_rating, "Heat map of Genre x Studio by rating"),
]
for panel_position, (panel_table, panel_title) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 2, panel_position)
    sns.heatmap(panel_table, cmap='YlGnBu')
    plt.title(panel_title)

plt.tight_layout()

## Seaborn
### Cool plottings

In [None]:
# Joint plot of worldwide gross against IMDb rating; kind="hex" selects hexagonal bins.
sns.jointplot(x=movies['worldwide_gross'], y=movies['imdb_rating'], kind="hex")

### Heat map
First create a pivot table (rows x columns, each cell holding a value) and then plot it

In [None]:
# Pivot: one row per genre, one column per studio, each cell the aggregated gross.
heatmap_data = movies.pivot_table(index='Main_Genre', columns='studio', values='worldwide_gross', aggfunc='mean') # aggfunc can also be sum, median, ...

sns.heatmap(heatmap_data, cmap='YlGnBu') # cmap = the color map

## Matplotlib

In [None]:
# Plot the yearly mean gross against its index (the year, per the x-axis label).
# NOTE(review): no `yerr`/`xerr` is passed, so no error bars are drawn despite the
# title - confirm whether an error series should be supplied.
plt.errorbar(mean_movie_gross_year.index, mean_movie_gross_year)

plt.title('Mean Worldwide Gross per Year with Error Bounds')
plt.xlabel('Year')
plt.ylabel('Worldwide Gross ($)')