# File to process data from our open survey (e.g. Reddit)
## From importing data to processing and visualizing


In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Importing data

In [3]:
# Load the open-survey CSV export into a DataFrame
df_reddit_data = pd.read_csv(
    filepath_or_buffer='data/umfrage-nachhaltige-mobilitaet_offen.csv'
)

## Methods for data processing

In [None]:
def explore_data(data):
    """Print a quick overview of a DataFrame to standard output.

    Writes the ``DataFrame.info()`` summary followed by the first five rows.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        None.
    """
    # ``info()`` writes its report itself and returns None, so wrapping it in
    # ``print`` would emit a stray "None" line after the summary.
    data.info()
    # Display the first 5 rows of the data
    print(data.head())
    
def drop_data(data):
    """Return a copy of ``data`` without duplicate rows and without missing values.

    Args:
        data: pandas DataFrame to clean. The input frame is not modified.

    Returns:
        A new DataFrame in which duplicated rows have been removed and every
        row containing at least one missing value has been dropped.
    """
    # Drop duplicates (returns a new frame, the input stays untouched)
    duplicate_free_data = data.drop_duplicates()
    # Use the returning form of ``dropna``: with ``inplace=True`` the call
    # returns None, which would make this function return None as well.
    na_free_data = duplicate_free_data.dropna()
    return na_free_data

def fill_data(data):
    """Fill missing values by propagating neighbouring observations.

    Each gap is first filled with the last valid value above it (forward
    fill). Gaps at the top of a column, which have no predecessor, are then
    filled with the next valid value below them (backward fill). A column
    without any valid value stays empty.

    Args:
        data: pandas DataFrame with missing values. The input frame is not
            modified.

    Returns:
        A new DataFrame with the gaps filled.
    """
    # ``ffill``/``bfill`` return new frames; the filled result has to be
    # returned, otherwise the caller gets the unfilled input back.
    return data.ffill().bfill()

def split_data(data):
    """Split the survey data into topic-specific column subsets.

    The column subsets are taken from ``data`` as passed in (rows with
    missing values included); the NA-free frame is returned separately.

    Args:
        data: pandas DataFrame that contains all columns selected below.

    Returns:
        tuple: ``(data, data_without_na, data_financial_correlation,
        data_mobility_correlation, data_other_correlation)``.

    Raises:
        KeyError: if one of the selected columns is missing from ``data``.
    """
    data_without_na = data.dropna()
    # TODO select proper columns for correlation with financial situation
    financial_columns = ['financial_situation', 'financial_situation_future', 'financial_situation_past']
    # TODO select proper columns for correlation with mobility
    mobility_columns = ['mobility', 'mobility_future', 'mobility_past']
    # TODO select proper columns for correlation with 'other' data.
    # An empty-string entry is not a column name and would make the selection
    # below raise KeyError, so only real column names are listed here.
    other_columns = ['age']
    data_financial_correlation = data[financial_columns]
    data_mobility_correlation = data[mobility_columns]
    data_other_correlation = data[other_columns]
    return (
        data,
        data_without_na,
        data_financial_correlation,
        data_mobility_correlation,
        data_other_correlation,
    )

def prepare_data(data, feature_columns, target_column):
    """Split a DataFrame into train and test partitions for modelling.

    Args:
        data: pandas DataFrame holding both the feature and the target columns.
        feature_columns (list): names of the columns to use as features.
        target_column (str): name of the target variable.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    features = data[feature_columns]
    target = data[target_column]
    # Hold out 20% of the rows for testing; the fixed seed keeps the split
    # reproducible between runs.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    return train_features, test_features, train_target, test_target

def scale_data(X_train, X_test):
    """Standardize train and test features with a shared ``StandardScaler``.

    Args:
        X_train: pandas DataFrame with the training features.
        X_test: pandas DataFrame with the test features.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler()
    # The scaler is fitted on the training features only and then applied to
    # both partitions, so the test data does not influence the fit.
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

def train_model(X_train, y_train):
    """Fit a linear regression on the training data.

    Args:
        X_train: pandas DataFrame with the training features.
        y_train: training target values.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Compute the mean squared error of a fitted model on test data.

    Args:
        model: fitted model exposing ``predict``.
        X_test: pandas DataFrame with the test features.
        y_test: true target values for ``X_test``.

    Returns:
        The mean squared error between ``y_test`` and the predictions.
    """
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)

def analyze_data(X_train, X_test, Y_train, Y_test):
    """Fit a linear regression, print its test error and return it.

    The features are used as passed in; no scaling is applied here.

    Args:
        X_train: pandas DataFrame with the training features.
        X_test: pandas DataFrame with the test features.
        Y_train: training target values.
        Y_test: true target values for ``X_test``.

    Returns:
        The mean squared error on the test partition.
    """
    regressor = LinearRegression().fit(X_train, Y_train)
    mse = mean_squared_error(Y_test, regressor.predict(X_test))
    print('Mean Squared Error:', mse)
    return mse

def visualize_data(data):
    """Placeholder for the general data visualization.

    TODO: implement the visualization with matplotlib and seaborn. For now
    the function only prints a status message and does not use ``data``.

    Args:
        data: pandas DataFrame to visualize.

    Returns:
        None.
    """
    status_message = 'Visualizing the data'
    print(status_message)
    
def visualize_correlation(data, feature_columns, target_column):
    """Show a heatmap of the pairwise correlations of features and target.

    Args:
        data: pandas DataFrame holding the feature and target columns.
        feature_columns (list): names of the columns to use as features.
        target_column (str): name of the target variable.

    Returns:
        None.
    """
    plotted_columns = feature_columns + [target_column]
    correlations = data[plotted_columns].corr()
    heatmap_options = {
        'annot': True,      # write the coefficient into every cell
        'cmap': 'coolwarm',
        'fmt': '.2f',       # two decimals per annotated coefficient
        'linewidths': 0.5,
    }
    plt.figure(figsize=(10, 10))
    sns.heatmap(correlations, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.show()

def visualize_model(model, X_test, y_test):
    """Scatter-plot the model's predictions against the actual target values.

    Args:
        model: fitted model exposing ``predict``.
        X_test: pandas DataFrame with the test features.
        y_test: true target values for ``X_test``.

    Returns:
        None.
    """
    predictions = model.predict(X_test)
    # Actual values on the x-axis, predictions on the y-axis.
    plt.scatter(x=y_test, y=predictions)
    plt.title('Actual vs Predicted')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()
    
def process_data(data, feature_columns=None, target_column=None):
    """Run the processing pipeline: explore, clean, visualize and model.

    The modelling steps need to know which columns are features and which
    one is the target. When either is not given, the pipeline stops after
    the general visualization instead of calling the modelling helpers
    without their required arguments.

    Args:
        data: pandas DataFrame with the raw survey data. It is not modified.
        feature_columns (list, optional): names of the columns to use as
            features.
        target_column (str, optional): name of the target variable.

    Returns:
        None.
    """
    explore_data(data)
    # Clean on a copy and keep the result, so every later step works on the
    # cleaned frame. Once rows with missing values are dropped there is
    # nothing left to fill, so no separate fill step is needed.
    cleaned_data = data.drop_duplicates().dropna()
    visualize_data(cleaned_data)
    if feature_columns is None or target_column is None:
        # Nothing to model without an explicit feature/target selection.
        return None
    visualize_correlation(cleaned_data, feature_columns, target_column)
    X_train, X_test, y_train, y_test = prepare_data(
        cleaned_data, feature_columns, target_column
    )
    model = train_model(X_train, y_train)
    visualize_model(model, X_test, y_test)
    analyze_data(X_train, X_test, y_train, y_test)
    return None

## Data Exploration

In [None]:
# Displaying the first 20 rows of the data

In [None]:
# Summarize the columns. info() prints its report itself and returns None,
# so it is called without print() to avoid a stray "None" line.
df_reddit_data.info()
# Show the first 20 rows (rendered as the cell output)
df_reddit_data.head(20)
# get count of entries
#df_reddit_data.count()

## Cleaning the data

In [None]:
# Count the missing values in each column
df_reddit_data.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df_reddit_data.duplicated().sum()

In [None]:
# Count the distinct values in each column
df_reddit_data.nunique()

In [None]:
# Remove duplicated rows from the DataFrame in place
df_reddit_data.drop_duplicates(inplace=True)

In [None]:
# Drop every row that contains a missing value (in place);
# nothing is imputed in this step
df_reddit_data.dropna(inplace=True)

In [None]:
# Replace the remaining missing values in numeric columns with the column mean.
# numeric_only=True restricts mean() to numeric columns so that non-numeric
# (e.g. text) columns do not break the computation; those columns are not
# filled here.
df_reddit_data.fillna(df_reddit_data.mean(numeric_only=True), inplace=True)

## Data Preprocessing

In [None]:
# Show the data type of each column
df_reddit_data.dtypes


## Data Analysis

In [None]:
# Show descriptive summary statistics of the data
df_reddit_data.describe()

## Data Visualization

In [None]:
# Plotting the distribution of the data
# Apply the seaborn style before the figure is created so it affects this plot
sns.set()
axes = df_reddit_data.hist(bins=50, figsize=(20, 15))
# Label every histogram; np.ravel also copes with a single Axes object
for ax in np.ravel(axes):
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
# Figure-level title, because hist() draws one subplot per column
plt.suptitle('Distribution of the data')
# Render last: anything set after plt.show() would end up on a new, empty figure
plt.show()
