<h2>Pandas Cheat Sheet</h2>
<h3>Topics Covered:</h3>
<ul>
    <li>Basic CRUD Operations</li>
    <li>Viewing DataFrame Info</li>
    <li>Regression</li>
</ul>

<h3>Create</h3>

In [None]:
# Worked example: load a stock's price history and take a first look at it
# before the regression performed later in the notebook.

import pandas as pd

# Load the data into a pandas DataFrame
df = pd.read_csv('AAPL.csv')

# Exploratory data analysis starts here.
# Show the first 5 rows. As the cell's last expression the frame is rendered
# with Jupyter's rich table display instead of the plain text print() produces.
df.head()

In [None]:
# Show the data type of each column along with the number of non-null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()

In [None]:
# Show summary statistics for the numeric columns only
# Left as the cell's last expression so Jupyter renders the summary as a table
# (print() would flatten it to plain text).
df.describe()

In [None]:
# Show summary statistics for the non-numeric (object) columns only
# Left as the cell's last expression so Jupyter renders the summary as a table
# (print() would flatten it to plain text).
df.describe(include='object')

In [None]:
# Accessing columns in a pandas DataFrame - the column names are the keys.
# - In many cases, a column that represents dates needs to be cast to datetime.
# - If its dtype is 'object' it may or may not already hold datetimes, and may
#   need to be cast as such.
# - 'YYYY-MM-DD' is a common format for dates in pandas.

df['Date'] = pd.to_datetime(df['Date'])

# The 'Date' column is now of type datetime64.
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
df.info()

In [None]:
# Access multiple columns by passing a list of column names.
# .head() limits the output to a short preview instead of dumping the whole
# frame, and the bare expression gets Jupyter's rich table display.
df[['Open', 'Close']].head()

In [None]:
# Now, let's perform some actual analysis and regression on the data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Number of most recent observations held out for testing
TEST_SIZE = 100

# Split the closing prices in row order: everything before the final
# TEST_SIZE rows is used for training, the final TEST_SIZE rows for testing.
close_prices = df['Close']
train_data = close_prices.iloc[:-TEST_SIZE]
test_data = close_prices.iloc[-TEST_SIZE:]

# Create and fit the ARIMA model
model = ARIMA(train_data, order=(1, 1, 1))
model_fit = model.fit()

# Forecast one value for every held-out observation
predictions = model_fit.forecast(steps=len(test_data))

# Evaluate the model's performance
mse = mean_squared_error(test_data, predictions)  # average squared difference between actual and predicted values
rmse = np.sqrt(mse)  # same units as the price itself
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Visualize the actual and predicted stock prices.
# The series are plotted against the 'Date' column so the x-axis really shows
# dates, matching its label, rather than integer row positions.
# to_datetime is applied here so the plot does not depend on an earlier cast.
dates = pd.to_datetime(df['Date'])
train_dates = dates.loc[train_data.index]
test_dates = dates.loc[test_data.index]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_dates, train_data, label='Training Data')
ax.plot(test_dates, test_data, label='Actual Price')
ax.plot(test_dates, predictions, label='Predicted Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Stock Price Prediction')
ax.legend()
plt.show()

<h3>Removing leading/trailing whitespace</h3>

In [None]:
import pandas as pd
import numpy as np

num_rows = 10

# Seeded generator so the sample data is the same on every run
rng = np.random.default_rng(42)

# Create a dictionary of data with leading/trailing spaces in both the column
# names and the string values
cols = {
    'ID': np.arange(num_rows),
    '  Programming Langauge   ': rng.choice(['Java   ', '   Python', ' JavaScript  '], num_rows),
    '  Score  ': rng.integers(60, 100, num_rows)  # upper bound is exclusive
}

# Create a dataframe from the dictionary
df = pd.DataFrame(cols)

# Remove leading/trailing spaces from every column name by passing a
# dict comprehension ({old_name: stripped_name}) to rename()
df = df.rename(columns={col: col.strip() for col in df.columns})

# The string values carry the same padding; strip them as well so the data,
# not just the header, is clean
df['Programming Langauge'] = df['Programming Langauge'].str.strip()

print(df.columns)

In [None]:
import pandas as pd
import numpy as np

# We can also use a lambda expression to perform the same operation

num_rows = 10

# Seeded generator so the sample data is the same on every run
rng = np.random.default_rng(42)

# Create a dictionary of data with leading/trailing spaces in both the column
# names and the string values
cols = {
    'ID': np.arange(num_rows),
    '  Programming Langauge   ': rng.choice(['Java   ', '   Python', ' JavaScript  '], num_rows),
    '  Score  ': rng.integers(60, 100, num_rows)  # upper bound is exclusive
}

# Create a dataframe from the dictionary
df = pd.DataFrame(cols)

# rename() calls the lambda once per column name and uses the returned
# (stripped) string as the new name
df = df.rename(columns=lambda x: x.strip())

# The string values carry the same padding; strip them as well so the data,
# not just the header, is clean
df['Programming Langauge'] = df['Programming Langauge'].str.strip()

print(df.columns)


<h3>Reading a TSV file</h3>

In [None]:
# A tab-separated file goes through the same CSV reader;
# the only difference is the delimiter passed via sep.
df = pd.read_csv(
    'data.txt',
    sep='\t',
)

<h3>Reading an Excel file</h3>

In [None]:
# Simplest form: read the workbook with every option left at its default
pd.read_excel(io='data.xlsx')

In [None]:
# Choose which worksheet to load by giving its name
pd.read_excel(
    io='data.xlsx',
    sheet_name='Sheet1',
)

In [None]:
# Skip rows if necessary: an integer skiprows drops that many rows from the
# top of the sheet before parsing. Uses read_excel on the same workbook as
# the other cells in this section.
pd.read_excel('data.xlsx', skiprows=2)

In [None]:
# Read sheet names.
# ExcelFile keeps the workbook open; the with-block closes the handle as soon
# as the names have been read instead of leaving it open.
with pd.ExcelFile('data.xlsx') as workbook:
    sheet_names = workbook.sheet_names

sheet_names

<h3>Basic Statistics</h3>

In [None]:
# Obtain count/mean/std/min/quartiles/max of the non-object columns.
# Left as the cell's last expression so Jupyter renders the summary as a table
# (print() would flatten it to plain text).
df.describe()

In [None]:
# Mean of a single column: select the column first, then aggregate it
values = df['column_name']
values.mean()

In [None]:
# Median of a single column: select the column first, then aggregate it
values = df['column_name']
values.median()

In [None]:
# Mode of a single column: select the column first, then aggregate it
values = df['column_name']
values.mode()

In [None]:
# Calculate min/max of a specific column.
# A cell only displays its last expression, so two separate calls would show
# the max and silently drop the min. Requesting both in one agg() call returns
# a Series indexed by 'min' and 'max'.
df['column_name'].agg(['min', 'max'])

<h3>Miscellaneous DataFrame Formatting</h3>

In [None]:
# Swap rows and columns; transpose() is the method form of the .T accessor
df_transposed = df.transpose()