# Pandas 

## Creating a dataframe

**From a dictionary**

In [3]:
import pandas as pd
# Column-oriented input: every key is a column label and every list holds
# the values of that column.
data = dict(
    Marks=[99, 98, 95, 94, 90, 88],
    Name=['Tom', 'Jack', 'nick', 'juli', 'peter', 'paul'],
)

# Build the DataFrame from the dictionary.
df = pd.DataFrame(data)

# A bare name as the last expression makes the notebook render the frame.
df

Unnamed: 0,Marks,Name
0,99,Tom
1,98,Jack
2,95,nick
3,94,juli
4,90,peter
5,88,paul


**From a list of dictionaries**

In [None]:
# Row-oriented input: one dictionary per row, keyed by column label.
data = [
    dict(mark=99, name="john"),
    dict(mark=90, name="peter"),
    dict(mark=90, name="peter"),
]
df = pd.DataFrame(data)
df

**From a list of lists**

In [5]:
# Row-oriented input: each inner list is one row. Plain lists carry no
# labels, so the column headers are passed separately.
data = [
    ['John', 'Darwin'],
    ['Peter', 'Cooper'],
]
df = pd.DataFrame(data, columns=['First Name', 'Last Name'])
df

Unnamed: 0,First Name,Last Name
0,John,Darwin
1,Peter,Cooper


**From arrays**

In [None]:
import numpy as np
import pandas as pd

# A 2 x 2 NumPy array to use as the data source
data = np.array([
    [5.8, 2.8],
    [6.0, 2.2],
])
print(data)

# Turn each array column into a named DataFrame column
adf = pd.DataFrame(dict(Column1=data[:, 0], Column2=data[:, 1]))
print(adf)

**From a Pandas Series**

In [None]:
# A named Series; its name becomes the column label once converted to a frame
s = pd.Series(list("abc"), name="vals")
s.to_frame()

**Specify cells, index and headers**

In [None]:
# The nested list supplies the cell values (one inner list per row), `index`
# supplies the row labels and `columns` supplies the headers.
abcdf = pd.DataFrame(
    data=[[0, 2, 3], [0, 4, 1], [10, 20, 30]],
    index=[4, 5, 6],
    columns=['A', 'B', 'C'],
)
abcdf

**Get the content of a cell**

In [None]:
# Read a single cell: `.at` takes the row label (4) and the column label ('B')
abcdf.at[4, 'B']

**Changing the content of a cell**

In [None]:
# Overwrite a single cell; this modifies `abcdf` itself rather than a copy
abcdf.at[4, 'B'] = 3
abcdf

**From a csv file**

In [None]:
import pandas as pd
# Load the csv file into a DataFrame (comma given explicitly as the separator)
food_df = pd.read_csv("../data/food.csv", sep=",")

# Print out the top 5 rows
food_df.head()

**From a json file**

In [None]:
import pandas as pd
# Load the json file into a DataFrame and preview its first rows
eli5df = pd.read_json(r'../data/eli5.json')
eli5df.head()

## Inspecting a dataframe

**Number of rows and columns**

In [None]:
# `shape` is a (number of rows, number of columns) pair
rows = food_df.shape[0]
cols = food_df.shape[1]
print("There are %d rows and %d columns" % (rows, cols))

In [None]:
# View the number of rows: the length of the row index is the row count
# (displaying `df.index` on its own shows the index object, not a number)
len(df.index)

**Headers**

In [None]:
# View the headers, i.e. the column labels of the frame
eli5df.columns

**Dispersion statistics**

In [None]:
# Get summary statistics; by default describe() reports count, mean, std,
# min, quartiles and max for the numeric columns
food_df.describe()

**Checking for NULLs**

In [None]:
# Get the number of nulls for each column: isnull() marks missing cells as
# True and sum(axis = 0) adds them up down the rows, one total per column
food_df.isnull().sum(axis = 0)

## Creating subframes

**Extracting a column**

In [None]:
# Retrieve a single column by its label and show its first 5 rows
eli5df['question'].head()

**Converting a column to a list**

In [None]:
# Convert the column to a plain Python list; note this displays every value,
# which can be a long output for a large frame
eli5df['question'].tolist()

**Extracting multiple columns**

In [None]:
# Retrieve multiple columns by passing a list of labels (first 5 rows shown)
eli5df[['question','author_name']].head()

**Extracting a row**

In [None]:
# Retrieve the row whose index label is 1 (`.loc` selects by label)
eli5df.loc[1]

**Extracting multiple rows**

In [None]:
# Retrieve the rows labelled 1 through 2; a `.loc` slice includes both ends
eli5df.loc[1:2]

**Dropping some columns**

In [None]:
# drop() returns a new frame without the listed columns; `eli5df` keeps them
X = eli5df.drop(columns=['question', 'author_name'])
X.head()

## Modifying a dataframe

**Retrieve a subframe that satisfies a condition**

In [None]:
# Retrieve a subframe that satisfies a condition: the comparison builds a
# boolean mask, and indexing with it keeps only the rows where "flair"
# equals "Physics"
eli5df[eli5df["flair"] == "Physics"]

**Shuffling a pandas frame**

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows of X. A fixed random_state makes the resulting order
# reproducible when the notebook is re-run from a fresh kernel.
df = shuffle(X, random_state=42)
df.head()

**Applying a function to a column**

In [None]:
# Applying a function to a column
def list_len(string):
    """Return the number of whitespace-separated words in ``string``.

    ``str.split()`` with no argument splits on any run of whitespace and
    discards empty pieces, so repeated, leading or trailing spaces (and tabs
    or newlines) do not inflate the count. An empty string gives 0.
    """
    return len(string.split())
    
# Run list_len on every value of the column and show the first 5 results
eli5df["question"].apply(list_len).head()

**Getting the type of a column**

In [None]:
# Show the dtype of the values stored in the column
eli5df["question"].dtypes

**Concatenating two frames**

In [None]:
# The question column, and the result of list_len applied to each question
Q = eli5df["question"]
Qcounts = Q.apply(list_len)

# Re-wrap the counts under the label they should carry as a column
C = pd.Series(Qcounts, name="Headers")

# axis=1 places the two Series side by side as columns
newdf = pd.concat([Q, C], axis=1)
newdf

**Extracting all text from a column**

In [None]:
import pandas as pd
data = {'Family_Name': ["de la porte", "brown", "van de Berg", "Pater", "Caret", "Starlet"]}

# Create a pd DataFrame.
df = pd.DataFrame(data)

# Join every value of the column into one string, separated by single spaces.
# This is the cell's last expression, so it is what the notebook displays.
df['Family_Name'].str.cat(sep=" ")

**Handling Columns with List values**

In [None]:
# Every "Name" cell holds a whole Python list rather than a single value
data = dict(
    Marks=[99, 98, 95],
    Name=[['Tom', 'Jack'], ['nick'], ['juli', 'peter', 'paul']],
)

# Create the DataFrame and preview it
df = pd.DataFrame(data)
df.head()

In [None]:
# Flatten the list-valued "Name" column into one flat list.
# Iterating over the column's values (instead of looking rows up with
# df["Name"][i]) works whatever the index labels are, and extend() appends
# in place rather than building a new list on every pass.
l = []
for names in df["Name"]:
    print(names)
    l.extend(names)
l


**Get the size of each group**

For each value in a column, get the number of rows that have that value.

In [None]:
# Size of each group: the number of rows for every distinct value of 'Marks'
print(df.groupby('Marks').size())

## Exporting a dataframe to a file

In [None]:
# Write the frame to a csv file at the given relative path, encoded as UTF-8
df.to_csv('file_name.csv', encoding='utf-8')

In [None]:
# to an excel file
# (no `encoding` argument here: to_excel deprecated it in pandas 1.5 and
# removed it in 2.0, where passing it raises a TypeError)
df.to_excel('file_name.xlsx')