<a href="https://colab.research.google.com/github/MLUISG/Functions/blob/main/Just_the_Functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def show_me(df):
    """Print a quick overview of a DataFrame: info, describe, head, and tail.

    Money in the bank,
    Show me the stats and details,
    Info, head, tail, done.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to summarise.

    Returns:
        - None. Everything is written to standard output. The describe, head
          and tail tables are printed transposed (one row per column of df).
    """
    print('Info')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    df.info()
    print('\n\n')
    print('Described')
    print(df.describe().T)
    print('\n\n')
    print('Head')
    print(df.head().T)
    print('\n\n')
    print('Tail')
    print(df.tail().T)

In [None]:
def split_df_by_dtype(df):
    """
    Partition the columns of a DataFrame by whether their dtype is object.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to partition.

    Returns:
        - (pandas.DataFrame, pandas.DataFrame): A 2-tuple, in this order:
            1. A frame with only the object-dtype columns.
            2. A frame with every remaining (non-object) column.
    """
    dtype_key = 'object'
    return (
        df.select_dtypes(include=dtype_key),
        df.select_dtypes(exclude=dtype_key),
    )

In [None]:
def find_tuple_cols(df):
    """Return the names of the columns that hold at least one tuple value.

    Tuple columns sought,
    Amidst the data's great breadth,
    Found and returned whole.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to scan.

    Returns:
        - list: Column labels, in DataFrame order, where any cell is a tuple.
    """
    def holds_tuple(value):
        return isinstance(value, tuple)

    matches = []
    for name in df.columns:
        # One tuple anywhere in the column is enough to report it.
        if df[name].apply(holds_tuple).any():
            matches.append(name)
    return matches

In [None]:
def convert_tuples_to_lists(df):
    """Convert every tuple value in the DataFrame to a list, in place.

    From tuples to lists,
    DataFrame transformed with ease,
    Data now unbound.

    Every cell of every column is checked, so a column is handled correctly
    even when its first value is not a tuple, when the index has no label 0,
    or when the frame is empty. Values that are not tuples are left untouched.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to convert. It is modified
          in place.

    Returns:
        - pandas.DataFrame: The same DataFrame object that was passed in.
    """
    def _is_tuple(value):
        return isinstance(value, tuple)

    def _listify(value):
        # Only tuples are converted; list("abc") or list(5) would corrupt or
        # crash on the other values of a mixed column.
        return list(value) if isinstance(value, tuple) else value

    for col in df.columns:
        column = df[col]
        # Reassign only columns that actually contain a tuple so that all
        # other columns keep their dtype.
        if column.apply(_is_tuple).any():
            df[col] = column.apply(_listify)
    return df

In [None]:
def remove_outliers(df, n_std=3):
    """Mask outliers in every numeric column, in place.

    Outliers expelled,
    Data purged of impurities,
    Clean and pristine now.

    A value counts as an outlier when it does not lie strictly within
    ``n_std`` standard deviations of its column mean. Outliers are replaced
    with NaN; no rows are dropped, so the frame keeps its shape.

    Non-numeric columns are skipped. Columns whose standard deviation is zero
    or undefined (constant, single-row or all-null columns) are skipped too,
    because the strict bounds would otherwise blank out every value.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to clean. It is modified
          in place.
        - n_std (float): Half-width of the accepted band, in standard
          deviations. Defaults to 3.

    Returns:
        - pandas.DataFrame: The same DataFrame object that was passed in.
    """
    for col in df.select_dtypes(include="number").columns:
        column = df[col]
        std = column.std()
        # `not std > 0` is true for both 0 and NaN: no usable spread.
        if not std > 0:
            continue
        mean = column.mean()
        spread = n_std * std
        within = (column > mean - spread) & (column < mean + spread)
        # where() keeps the values inside the band and puts NaN elsewhere.
        df[col] = column.where(within)
    return df

In [None]:
def impute_means(df):
    """Fill the null values of every numeric column with that column's mean.

    Missing values found,
    Impute means to numeric cols,
    Clean data once more.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to fill. It is modified
          in place; non-numeric columns are left untouched.

    Returns:
        - pandas.DataFrame: The same DataFrame object that was passed in.
    """
    for col in df.columns:
        column = df[col]
        if not pd.api.types.is_numeric_dtype(column):
            continue
        if not column.isna().any():
            continue
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that form works on a temporary
        # object and does not update df when pandas Copy-on-Write is active.
        df[col] = column.fillna(column.mean())
    return df

In [None]:
def all_the_value_counts(df):
    """Print the value counts of each column of a pandas DataFrame.

    Counting values wide,
    All columns, one by one revealed,
    Insight to be found.

    Parameters:
        - df (pandas.DataFrame): The DataFrame whose columns are tallied.

    Returns:
        - None. A header line, the counts and a blank spacer are printed
          for every column.
    """
    for name in df.columns:
        print(f"Value counts for {name}:")
        # sep="\n" ends the counts line and is followed by the "\n" spacer,
        # giving the same output as two separate print calls.
        print(df[name].value_counts(), "\n", sep="\n")

In [None]:
def count_the_cats(df):
    """Print the value counts of the categorical columns of a DataFrame.

    Meow, count the cats,
    Categorical values,
    Pandas purrs content.

    Only columns selected by dtype "category" or "object" are reported.

    Parameters:
        - df (pandas.DataFrame): The DataFrame whose columns are tallied.

    Returns:
        - None. Output goes to standard output.
    """
    categorical = df.select_dtypes(include=["category", "object"])
    for name in categorical.columns:
        counts_header = f"Value counts for {name}:"
        print(counts_header)
        tally = df[name].value_counts()
        print(tally)
        print("\n")

In [None]:
def count_nulls(data):
    """Print the shape of the data and the null count of each column that has nulls.

    Null values abound,
    Display them with a function,
    Data cleaning done.

    Parameters:
        - data (pandas.DataFrame): The DataFrame to inspect.

    Returns:
        - None. The shape is printed first, then the per-column null counts
          restricted to columns with at least one null.
    """
    null_totals = data.isna().sum()
    print(data.shape)
    print(null_totals.loc[null_totals > 0])

In [None]:
def get_sorted_correlations(df):
    """Return the absolute pairwise correlations, strongest first.

    Correlations are computed with DataFrame.corr() over the numeric and
    boolean columns only, rounded to two decimals and taken as absolute
    values. Each column's correlation with itself is removed by position, so
    a genuine 1.00 between two different columns is kept. Pairs whose
    correlation is undefined (NaN) are dropped.

    Parameters:
        - df (pandas.DataFrame): The DataFrame to analyse. Non-numeric
          columns are ignored.

    Returns:
        - pandas.Series: Indexed by (column, column) pairs and sorted in
          descending order. Every pair appears twice, once in each order.
    """
    numeric = df.select_dtypes(include=["number", "bool"])
    matrix = numeric.corr().round(2).abs()
    # Blank the diagonal by position instead of filtering on `value < 1`,
    # which would also discard perfectly correlated distinct columns.
    for position in range(len(matrix)):
        matrix.iat[position, position] = float("nan")
    corrs = matrix.unstack().dropna()
    return corrs.sort_values(ascending=False)

In [None]:
def per_nulls(data):
    """Print the count and percentage of null values for columns that have any.

    Null values lurking,
    Percentage uncovered,
    Data now refined.

    Parameters:
        - data (pandas.DataFrame): The DataFrame to inspect.

    Returns:
        - None. A table with 'Total' and 'Percent' columns (percent rounded
          to two decimals) is printed for the columns with at least one null.
    """
    null_mask = data.isna()
    null_totals = null_mask.sum()
    share = null_totals / null_mask.count() * 100

    total = null_totals.sort_values(ascending=False)
    percent = share.round(2).sort_values(ascending=False)

    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    has_nulls = missing_data['Total'] > 0
    print("Missing data percentage:\n", missing_data[has_nulls])

In [None]:
def cull_nulls(df, threshold):
    """
    Return a copy of the DataFrame without the columns that are too sparse.

    A column is removed when its percentage of null values is strictly
    greater than the threshold.

    Parameters:
        - df (pandas.DataFrame): the DataFrame to process; it is not modified.
        - threshold (float): the highest percentage of nulls (0-100) a column
          may have and still be kept.

    Returns:
        - pandas.DataFrame: a new DataFrame with the sparse columns removed.

    A data grave threat,
    Nulls swarm and infect the set,
    Cull them, make it clean.
    """
    row_count = len(df)
    pct_null = df.isnull().sum() / row_count * 100
    too_sparse = pct_null > threshold
    return df.drop(columns=pct_null[too_sparse].index)

In [None]:
def forest_features(df, target, num_loops=1):
    """Fit random forest regressors on df, report the RMSE, and plot feature importances.

    Random forest grows,
    Predicting with learned powers,
    Features, insights flow.

    For each repetition the data is split 80/20 into training and testing
    sets, a 10-tree RandomForestRegressor is fitted on the training part, and
    the mean squared error on the testing part is recorded. The mean of the
    per-repetition RMSE values is printed, then the feature importances of
    the model from the last repetition are drawn as a bar chart in
    descending order.

    Parameters:
        - df (pandas.DataFrame): The data, including the target column.
        - target: Label of the column to predict; every other column is
          used as a feature.
        - num_loops (int): Number of split/fit/score repetitions.
          Defaults to 1.

    Returns:
        - None. The RMSE is printed and a new matplotlib figure is created.

    Raises:
        - ValueError: If num_loops is less than 1.
    """
    if num_loops < 1:
        raise ValueError("num_loops must be at least 1")

    X = df.drop(target, axis=1)
    y = df[target]

    mse_per_run = np.zeros(num_loops)
    # Seed NumPy's global generator once so the sequence of splits is
    # repeatable; train_test_split is called without its own random_state.
    np.random.seed(42)
    for idx in range(num_loops):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model = RandomForestRegressor(n_estimators=10, random_state=0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_per_run[idx] = mean_squared_error(y_test, y_pred)

    print(f'RMSE: {np.sqrt(mse_per_run).mean()}')

    # Importances come from the model fitted in the final repetition only.
    forest_importances = pd.Series(model.feature_importances_, index=X.columns)
    forest_importances = forest_importances.sort_values(ascending=False)
    plt.figure()
    forest_importances.plot.bar()
    plt.title("Feature importance")
    plt.ylabel("Mean decrease in impurity")