In [12]:
import pandas as pd
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library warnings raised by the
# pandas / scikit-learn calls in later cells are hidden as well - consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [13]:
# Read the population and migration tables from CSV files in the working directory
population_df, migration_df = (
    pd.read_csv(csv_name)
    for csv_name in ('population_df_1996.csv', 'migration_df_1996.csv')
)

### **Binary Classification for Predicting Emigration**

------------------------------------------------------------------------------------------------
## **1st Try**

In [14]:
# Build the "1st try" dataset from the total-emigration rows.
# The filter keeps only rows whose Sex is NOT 'Both sexes', so the samples are
# per-sex rows rather than one combined row per year.
# .copy() makes this an independent frame: the column added below is then a
# plain assignment instead of a chained assignment on a slice of migration_df.
emigration_data_corrected = migration_df[
    (migration_df['Country'] == 'All countries') &
    (migration_df['Sex'] != 'Both sexes') &
    (migration_df['Origin or Destination'] == 'Emigrants: All destinations')
].copy()

# Binary target: 1 when the row's emigration VALUE is above the mean VALUE, else 0
average_emigration_corrected = emigration_data_corrected['VALUE'].mean()
emigration_data_corrected['High_Emigration'] = (
    emigration_data_corrected['VALUE'] > average_emigration_corrected
).astype(int)

# NOTE(review): VALUE is used as a feature and is also the only input of the
# target (VALUE > mean), so the label is a direct threshold on a feature.
X_classification_corrected = emigration_data_corrected[['Year', 'VALUE']]
y_classification_corrected = emigration_data_corrected['High_Emigration']

# Ordered 80/20 split: the first 80% of rows (in their current order) train the
# model, the remaining rows are held out. .iloc makes the positional slicing
# explicit, since the index still carries the original row labels.
train_size_classification_corrected = int(0.8 * len(X_classification_corrected))
X_train_classification_corrected = X_classification_corrected.iloc[:train_size_classification_corrected]
X_test_classification_corrected = X_classification_corrected.iloc[train_size_classification_corrected:]
y_train_classification_corrected = y_classification_corrected.iloc[:train_size_classification_corrected]
y_test_classification_corrected = y_classification_corrected.iloc[train_size_classification_corrected:]

# Class counts of the target over the whole (unsplit) dataset
emigration_data_distribution_corrected = emigration_data_corrected['High_Emigration'].value_counts()

# Display the class counts together with the shapes of the four split objects
emigration_data_distribution_corrected, (
    X_train_classification_corrected.shape,
    X_test_classification_corrected.shape,
    y_train_classification_corrected.shape,
    y_test_classification_corrected.shape,
)


(High_Emigration
 1    31
 0    25
 Name: count, dtype: int64,
 ((44, 2), (12, 2), (44,), (12,)))

We have the following distribution for the binary target variable 'High_Emigration':

* '1' (above-average emigration): 31 data points
* '0' (below-average emigration): 25 data points

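This distribution indicates a relatively balanced dataset overall, which is good for training a binary classification model. Note, however, that the split used here is ordered (the first 80% of rows for training, the remaining rows for testing), so the class balance inside each split can differ from this overall balance.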

The dataset has been split into training and test sets with the following shapes:

* Training features (X_train): 44 samples, 2 features
* Test features (X_test): 12 samples, 2 features
* Training target (y_train): 44 samples
* Test target (y_test): 12 samples

In [15]:
# Fit a default-configured logistic regression on the training split
logistic_model_corrected = LogisticRegression().fit(
    X_train_classification_corrected,
    y_train_classification_corrected,
)

# Class predictions for the held-out rows
y_pred_logistic_corrected = logistic_model_corrected.predict(
    X_test_classification_corrected
)

# Score the predictions: overall accuracy plus the per-class text report
accuracy_logistic_corrected = accuracy_score(
    y_true=y_test_classification_corrected,
    y_pred=y_pred_logistic_corrected,
)
classification_report_logistic_corrected = classification_report(
    y_true=y_test_classification_corrected,
    y_pred=y_pred_logistic_corrected,
)

(accuracy_logistic_corrected, classification_report_logistic_corrected)


(0.9166666666666666,
 '              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00         1\n           1       0.92      1.00      0.96        11\n\n    accuracy                           0.92        12\n   macro avg       0.46      0.50      0.48        12\nweighted avg       0.84      0.92      0.88        12\n')

The logistic regression model has been trained and evaluated with the following results:

* Accuracy: 91.67%
* Classification Report:
* Precision for class 0 (below-average emigration) is reported as 0.00 because it is actually undefined: the model did not predict any instances of class 0, so there are no class-0 predictions to evaluate. This is likely because of the imbalance in the distribution of the target variable in the test set or the model biasing towards the majority class.
* Recall for class 0 is also 0.00, meaning the model failed to correctly identify any of the actual instances of class 0.
* F1-score for class 0 is 0.00, which is the harmonic mean of precision and recall, indicating poor performance for this class.
* For class 1 (above-average emigration), the model has a precision of 0.92, indicating that when it predicts high emigration, it is correct 92% of the time.
* The recall for class 1 is 1.00, which means the model successfully identified all instances of high emigration.
* The F1-score for class 1 is 0.96, suggesting a good balance between precision and recall for this class.
  
The model's high accuracy is somewhat misleading due to its failure to predict any instances of the minority class (class 0). This is a common issue in binary classification tasks with imbalanced data. Here the imbalance lies in the test set rather than in the dataset as a whole: the report shows a support of only 1 for class 0 against 11 for class 1, so always predicting class 1 already yields 91.67% accuracy. The effect is exacerbated by the small size of the dataset.

------------------------------------------------------------------------------------------------
## **2nd Try**

In [16]:
# Total emigration series: 'All countries', 'Both sexes', 'Emigrants: All destinations'
migration_filtered = migration_df.loc[
    lambda frame: (frame['Country'] == 'All countries')
    & (frame['Sex'] == 'Both sexes')
    & (frame['Origin or Destination'] == 'Emigrants: All destinations')
]

# Total population rows: 'Both sexes' across 'All ages'
population_filtered = population_df.loc[
    lambda frame: (frame['Sex category'] == 'Both sexes')
    & (frame['Single Year of Age'] == 'All ages')
]

# Build the binary target in one pass:
#   1. order the rows by Year so diff() compares consecutive years,
#   2. 'Emigration Change' = VALUE minus the previous row's VALUE,
#   3. 'Increased Emigration' = 1 when that change is positive, otherwise 0,
#   4. drop rows without a change value (the first year has nothing to compare to).
migration_sorted = (
    migration_filtered
    .sort_values('Year')
    .assign(**{
        'Emigration Change': lambda frame: frame['VALUE'].diff(),
        'Increased Emigration': lambda frame: frame['Emigration Change'].gt(0).astype(int),
    })
    .dropna(subset=['Emigration Change'])
)

# Preview the columns relevant to the classification task
migration_sorted_filtered = migration_sorted.loc[
    :, ['Year', 'VALUE', 'Emigration Change', 'Increased Emigration']
]
migration_sorted_filtered.head(7)


Unnamed: 0,Year,VALUE,Emigration Change,Increased Emigration
73,1997,25.3,-5.9,0
118,1998,28.6,3.3,1
163,1999,31.5,2.9,1
208,2000,26.6,-4.9,0
253,2001,26.2,-0.4,0
298,2002,25.6,-0.6,0
343,2003,29.3,3.7,1


In [17]:
# Feature matrix (Year, VALUE) and binary target (did emigration increase?)
X = migration_sorted.loc[:, ['Year', 'VALUE']]
y = migration_sorted.loc[:, 'Increased Emigration']

# Seeded 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the logistic regression on the training portion
logistic_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out portion
y_pred = logistic_model.predict(X_test)

# Accuracy, confusion matrix and per-class text report on the test set
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

accuracy, conf_matrix, class_report

(0.3333333333333333,
 array([[1, 2],
        [2, 1]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.33      0.33      0.33         3\n           1       0.33      0.33      0.33         3\n\n    accuracy                           0.33         6\n   macro avg       0.33      0.33      0.33         6\nweighted avg       0.33      0.33      0.33         6\n')