# In [None]:
import json
import os

import pandas as pd
from flask import Flask, request, jsonify
from pymongo.mongo_client import MongoClient
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)

# SECURITY: the fallback URI below embeds a username and password in this
# file. Set the MONGODB_URI environment variable to supply the connection
# string instead; the literal is kept only so existing deployments keep
# working and should be rotated and removed.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://seymur:seymur23@hagosmarketing.8mru08u.mongodb.net/?retryWrites=true&w=majority&appName=HagosMarketing",
)
client = MongoClient(uri)


# Load and clean the Big5 persona records

db = client['Big5_US_Data']
collection = db['Big5']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
person_data = pd.DataFrame(documents_list)
person_data.drop('_id', axis=1, inplace=True)

# Drop every column whose share of nulls exceeds 20%, then drop any row that
# still contains a null.
null_fractions = person_data.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
person_data.drop(features_with_null_values, axis=1, inplace=True)
person_data.dropna(inplace=True)

# Bucket 'age' into labelled groups. The bins are right-closed so that each
# label's upper bound is included (e.g. 19 -> 'age_16_19', 15 -> 'age_0_15');
# include_lowest keeps age 0 inside the first bucket.
bins = [0, 15, 19, 24, 34, 44, 54, 64, float('inf')]
labels = ['age_0_15', 'age_16_19', 'age_20_24', 'age_25_34', 'age_35_44', 'age_45_54', 'age_55_64', 'age_65']

person_data['age_group'] = pd.cut(
    person_data['age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)


# Load the median-age-by-occupation table and reshape it to one row per
# (occupation, age group)

db = client['US_Census_Data']
collection = db['Median_Age_Occupation']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
accupation_for_age = pd.DataFrame(documents_list)
accupation_for_age.drop('_id', axis=1, inplace=True)

# Drop every column whose share of nulls exceeds 20%, then drop any row that
# still contains a null.
null_fractions = accupation_for_age.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
accupation_for_age.drop(features_with_null_values, axis=1, inplace=True)
accupation_for_age.dropna(inplace=True)


age_categories = [
    "age_16_19",
    "age_20_24",
    "age_25_34",
    "age_35_44",
    "age_45_54",
    "age_55_64",
    "age_65",
]

# Each row's 'median_age' cell is read positionally: element i is stored as
# the count for age_categories[i] and the last element as the median age.
# NOTE(review): presumably the cell is a list of 7 counts followed by the
# median -- confirm against the collection schema.
transformed_data = []
for _, row in accupation_for_age.iterrows():
    values = row['median_age']
    for i, age_category in enumerate(age_categories):
        transformed_data.append({
            "occupation": row['occupation'],
            "age_group": age_category,
            "count_people": values[i],
            "median_age": values[-1],
        })

# Passing the column list keeps the expected columns present even when the
# source table produced no rows.
accupation_for_age2 = pd.DataFrame(
    transformed_data,
    columns=["occupation", "age_group", "count_people", "median_age"],
)




# Cleaning data that is joined onto the result after filtering


# Load and clean the county-level economics table
db = client['ACS_Nationwide']
collection = db['Economics County Level']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
info_about_state = pd.DataFrame(documents_list)
info_about_state.drop('_id', axis=1, inplace=True)

# Drop every column whose share of nulls exceeds 20%.
null_fractions = info_about_state.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
info_about_state.drop(features_with_null_values, axis=1, inplace=True)

# Use the first row as the column labels, then remove that row from the data
# and renumber the remaining rows from 0.
info_about_state.columns = info_about_state.iloc[0]
info_about_state = info_about_state[1:].reset_index(drop=True)


# Load and clean the state-level economics table

db = client['ACS_Nationwide']
collection = db['Economics State']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
economic_about_state = pd.DataFrame(documents_list)
economic_about_state.drop('_id', axis=1, inplace=True)

# Reshape so each original column becomes a row ('State_Column') and each
# 'Label (Grouping)' value becomes a column.
df_melted = pd.melt(economic_about_state, id_vars=['Label (Grouping)'], var_name='State_Column', value_name='Value')
economic_about_state_pivot = df_melted.pivot_table(index='State_Column', columns='Label (Grouping)', values='Value', aggfunc='first')
economic_about_state_pivot.reset_index(inplace=True)

# Drop every column whose share of nulls exceeds 20%.
null_fractions = economic_about_state_pivot.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
economic_about_state_pivot.drop(features_with_null_values, axis=1, inplace=True)

# Keep only the rows whose source column name contains 'Estimate'. The
# explicit copy makes the result an independent frame, so the column
# assignment and in-place drop below do not operate on a slice of the
# unfiltered table.
economic_about_state_pivot = economic_about_state_pivot[
    economic_about_state_pivot['State_Column'].str.contains('Estimate')
].copy()

# The state name is the text before the first '!!'; store it lower-cased
# under 'state', the key used for the later merges.
economic_about_state_pivot['state'] = (
    economic_about_state_pivot['State_Column'].str.split('!!').str[0].str.lower()
)
economic_about_state_pivot.drop('State_Column', axis=1, inplace=True)


# Load and clean the social characteristics table

db = client['ACS_Nationwide']
collection = db['Social Characteristics']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
social_char = pd.DataFrame(documents_list)
social_char.drop('_id', axis=1, inplace=True)

# Reshape so each original column becomes a row ('State_Column') and each
# 'Label (Grouping)' value becomes a column.
df_melted = pd.melt(social_char, id_vars=['Label (Grouping)'], var_name='State_Column', value_name='Value')
social_char_pivot = df_melted.pivot_table(index='State_Column', columns='Label (Grouping)', values='Value', aggfunc='first')
social_char_pivot.reset_index(inplace=True)

# Drop every column whose share of nulls exceeds 20%.
null_fractions = social_char_pivot.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
social_char_pivot.drop(features_with_null_values, axis=1, inplace=True)

# Keep only the rows whose source column name contains 'Estimate'. The
# explicit copy makes the result an independent frame, so the column
# assignment and in-place drop below do not operate on a slice of the
# unfiltered table.
social_char_pivot = social_char_pivot[
    social_char_pivot['State_Column'].str.contains('Estimate')
].copy()

# The state name is the text before the first '!!'; store it lower-cased
# under 'state', the key used for the later merges.
social_char_pivot['state'] = (
    social_char_pivot['State_Column'].str.split('!!').str[0].str.lower()
)
social_char_pivot.drop('State_Column', axis=1, inplace=True)


# Load and clean the demographic characteristics table

db = client['ACS_Nationwide']
collection = db['demographic characteristics - US & State level']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
demographic_df = pd.DataFrame(documents_list)
demographic_df.drop('_id', axis=1, inplace=True)

# Reshape so each original column becomes a row ('State_Column') and each
# 'Label (Grouping)' value becomes a column.
df_melted = pd.melt(demographic_df, id_vars=['Label (Grouping)'], var_name='State_Column', value_name='Value')
demographic_pivot = df_melted.pivot_table(index='State_Column', columns='Label (Grouping)', values='Value', aggfunc='first')
demographic_pivot.reset_index(inplace=True)

# Drop every column whose share of nulls exceeds 20%.
null_fractions = demographic_pivot.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
demographic_pivot.drop(features_with_null_values, axis=1, inplace=True)

# Keep only the rows whose source column name contains 'Estimate'. The
# explicit copy makes the result an independent frame, so the column
# assignment and in-place drop below do not operate on a slice of the
# unfiltered table.
demographic_pivot = demographic_pivot[
    demographic_pivot['State_Column'].str.contains('Estimate')
].copy()

# The state name is the text before the first '!!'; store it lower-cased
# under 'state', the key used for the later merges.
demographic_pivot['state'] = (
    demographic_pivot['State_Column'].str.split('!!').str[0].str.lower()
)
demographic_pivot.drop('State_Column', axis=1, inplace=True)


# Load the per-state occupation records and flatten the nested 'occupation'
# document into plain columns

db = client['Persona']
collection = db['Information']

# Pull every document of the collection into a DataFrame.
documents_list = list(collection.find({}))
accupa_wage = pd.DataFrame(documents_list)
accupa_wage.drop('_id', axis=1, inplace=True)

# Drop every column whose share of nulls exceeds 20%.
null_fractions = accupa_wage.isnull().mean()
features_with_null_values = null_fractions[null_fractions > 0.2].index.tolist()
accupa_wage.drop(features_with_null_values, axis=1, inplace=True)

accupation_data = []
for _, row in accupa_wage.iterrows():
    occupation_dict = row['occupation']
    # A missing value (NaN) or any other non-dict cell cannot be flattened.
    if not isinstance(occupation_dict, dict):
        continue
    if 'job' in occupation_dict and 'employment' in occupation_dict:
        accupation_data.append({
            'state': row['state'],
            'occupation': occupation_dict['job'],
            'employment': occupation_dict['employment'],
            # 'income' is not part of the guard above, so read it leniently
            # rather than failing the whole load on one incomplete record.
            'income': occupation_dict.get('income'),
        })

# Passing the column list keeps the merge keys ('state', 'occupation')
# present even when no record qualified.
accupa_wage_df = pd.DataFrame(
    accupation_data,
    columns=['state', 'occupation', 'employment', 'income'],
)




# The prediction part

def _json_safe_records(frame):
    """Return *frame* as a list of row dicts with float NaN replaced by None.

    ``json`` serialises NaN as the bare token ``NaN``, which is not valid
    JSON; ``None`` is emitted as ``null`` instead.
    """
    return [
        {
            column: None if isinstance(value, float) and value != value else value
            for column, value in record.items()
        }
        for record in frame.to_dict(orient='records')
    ]


@app.route('/predict', methods=['POST'])
def predict():
    """Return the persona rows closest in age to the posted profile.

    Expects a JSON object with the keys ``gender``, ``country``, ``state``,
    ``occupation`` and ``age``. Persona rows matching gender/country/state
    exactly are ranked by (standardised) age distance and up to five nearest
    rows are kept. Those rows are left-joined with the occupation age table,
    the occupation wage table and the demographic, social and economic state
    tables; columns that end up more than 20% null or that contain the
    placeholder ``'(X)'`` are removed.

    Returns:
        A JSON array of row objects. An empty array is returned when no
        persona row matches the profile. A 400 response with an ``error``
        message is returned when the body is not a JSON object, a required
        key is missing, or ``age`` is not numeric.

    NOTE(review): the state tables lower-case their ``state`` key while
    ``person_data`` and ``accupa_wage_df`` are used as loaded -- confirm the
    casing of the stored values lines up, otherwise the joins match nothing.
    """
    input_features = request.get_json(silent=True)
    if not isinstance(input_features, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    required_fields = ('gender', 'country', 'state', 'occupation', 'age')
    missing = [field for field in required_fields if field not in input_features]
    if missing:
        return jsonify({'error': 'Missing required field(s): ' + ', '.join(missing)}), 400

    try:
        age = float(input_features['age'])
    except (TypeError, ValueError):
        return jsonify({'error': "'age' must be numeric."}), 400
    if pd.isna(age):
        return jsonify({'error': "'age' must be numeric."}), 400

    input_df = pd.DataFrame([input_features])

    # Filter and transform data based on input features
    match_columns = ['gender', 'country', 'state']
    same_profile = (person_data[match_columns] == input_df[match_columns].values[0]).all(axis=1)
    filter_df_persona = person_data[same_profile]
    filter_df_accupation = accupation_for_age2[accupation_for_age2['occupation'] == input_df['occupation'].loc[0]]

    # Nothing to rank: the scaler and the neighbour search both reject an
    # empty sample set.
    if filter_df_persona.empty:
        return jsonify([])

    scaler = StandardScaler()
    scaled_ages = scaler.fit_transform(filter_df_persona[['age']])
    scaled_query = scaler.transform(pd.DataFrame({'age': [age]}))

    # kneighbors() fails when asked for more neighbours than fitted samples.
    n_neighbors = min(5, len(filter_df_persona))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(scaled_ages)

    _, positions = knn.kneighbors(scaled_query)
    nearest_index = filter_df_persona.index[positions.flatten()]

    # isin() keeps the selected rows in their original table order.
    result_df_persona = filter_df_persona[filter_df_persona.index.isin(nearest_index)]

    search_df = pd.merge(result_df_persona, filter_df_accupation, on='age_group', how='left')
    search_df2 = pd.merge(search_df, accupa_wage_df, on=['state', 'occupation'], how='left')
    search_df3 = pd.merge(search_df2, demographic_pivot, on='state', how='left')
    search_df4 = pd.merge(search_df3, social_char_pivot, on='state', how='left')
    search_df5 = pd.merge(search_df4, economic_about_state_pivot, on='state', how='left')

    # Drop every column whose share of nulls exceeds 20%.
    null_fractions = search_df5.isnull().mean()
    sparse_columns = null_fractions[null_fractions > 0.2].index
    search_df5 = search_df5.drop(sparse_columns, axis=1)

    # Drop every column in which the placeholder '(X)' appears at least once.
    search_df5 = search_df5.loc[:, ~search_df5.isin(['(X)']).any()]

    # Convert the final dataframe to JSON and return
    return jsonify(_json_safe_records(search_df5))

if __name__ == "__main__":
    # Debug mode turns on Werkzeug's interactive debugger, which lets anyone
    # who can reach the server execute arbitrary Python. Keep it off unless
    # explicitly requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)