In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from transformers import DistilBertTokenizer, TFDistilBertModel
import json


In [None]:
# Step 1: Load and Preprocess Data
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
DATA_PATH = '/home/ahmedabdullahi/NLP590/NLPJobsFinder/Data/modelData.csv'
data = pd.read_csv(DATA_PATH)

# Normalize text fields
data['city'] = data['city'].str.lower()
data['country'] = data['country'].str.lower()

# Encode categorical fields
# pd.read_csv parses empty cells as NaN by default, so the '' key below would
# never match and blank capitals would be mapped to NaN. Converting NaN back
# to '' first lets the intended "blank -> 0" rule apply.
capital_map = {'primary': 1, 'admin': 0.5, 'minor': 0.2, '': 0}
data['capital'] = data['capital'].fillna('').map(capital_map)

# Normalize numeric fields
# The fitted scaler is kept at module level: it is persisted later and reused
# at inference time to scale raw lat/lng/population values.
# NOTE(review): any missing lat/lng/population values stay NaN after scaling
# -- confirm the CSV has none or impute before training.
scaler = MinMaxScaler()
data[['lat', 'lng', 'population']] = scaler.fit_transform(data[['lat', 'lng', 'population']])



In [None]:
# Tokenizer
# The tokenizer has to be instantiated here: encode_text() and the save step
# both reference the module-level name `tokenizer`. 'bert-base-uncased'
# matches the TFBertModel checkpoint the model is built from and the
# BertTokenizer class used to reload the saved tokenizer.
# (The DistilBERT model that used to be loaded here was overwritten by the
# model-building cell before any use, so that load has been dropped.)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_text(text_series, max_length=32):
    """Tokenize a pandas Series of strings into fixed-length NumPy arrays.

    Args:
        text_series: Series of text values; converted with ``.tolist()``.
        max_length: Length every sequence is padded/truncated to. Defaults to
            32 so the encodings line up with the model's ``shape=(32,)`` text
            inputs and the ``max_length=32`` used at prediction time.

    Returns:
        The tokenizer output (with NumPy tensors), including ``'input_ids'``.
    """
    return tokenizer(
        text_series.tolist(),
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="np"
    )


In [None]:

# Encode text fields
city_encodings = encode_text(data['city'])
country_encodings = encode_text(data['country'])

# Extract inputs
# NOTE(review): only the token ids are kept; the attention masks returned by
# the tokenizer are discarded, so padding positions reach the model unmasked.
X_city = city_encodings['input_ids']
X_country = country_encodings['input_ids']
X_numeric = data[['lat', 'lng', 'population']].values

# Example binary labels (modify this for your task)
# Drawn from a seeded generator so the placeholder labels are reproducible
# across kernel restarts, in line with the fixed random_state of the split.
SEED = 42
labels = np.random.default_rng(SEED).integers(0, 2, size=len(data))

# Train-test split (80/20); all four arrays are split with the same row indices
(
    X_city_train, X_city_val,
    X_country_train, X_country_val,
    X_numeric_train, X_numeric_val,
    y_train, y_val,
) = train_test_split(
    X_city, X_country, X_numeric, labels, test_size=0.2, random_state=SEED
)

In [None]:

# Step 2: Build the Model
SEQ_LEN = 32  # number of token ids per text input
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


def _mean_pooled_bert(token_ids):
    """Run the shared BERT encoder and average its last hidden state over the sequence axis."""
    hidden_states = bert_model(token_ids).last_hidden_state
    return tf.keras.layers.GlobalAveragePooling1D()(hidden_states)


# Inputs: two token-id sequences plus the three scaled numeric features
input_city = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name='city_input')
input_country = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name='country_input')
input_numeric = tf.keras.Input(shape=(3,), name='numeric_input')

# Text feature extraction (the same BERT instance encodes both text fields)
city_features = _mean_pooled_bert(input_city)
country_features = _mean_pooled_bert(input_country)

# Merge text and numeric features into a single vector
merged_features = tf.keras.layers.Concatenate()(
    [city_features, country_features, input_numeric]
)

# Classification head: one hidden layer, then a sigmoid unit
dense = tf.keras.layers.Dense(128, activation='relu')(merged_features)
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(
    inputs=[input_city, input_country, input_numeric],
    outputs=output,
)

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Step 3: Train the Model
# Inputs are listed in the same order as the model's inputs: city, country, numeric.
train_inputs = [X_city_train, X_country_train, X_numeric_train]
val_inputs = [X_city_val, X_country_val, X_numeric_val]

history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    batch_size=64,
    epochs=3,
)


: 

In [None]:

# Step 4: Save the Model
import joblib

# Location of the pickled scaler; the loading step reads this same variable.
scaler_path = 'scaler.pkl'

# Persist the three artifacts needed at inference time:
# the Keras model, the tokenizer files, and the fitted MinMaxScaler.
model.save('location_nlp_model')
tokenizer.save_pretrained('location_tokenizer')
joblib.dump(scaler, scaler_path)

In [None]:


# Step 5: Load the Model and Use
MODEL_DIR = 'location_nlp_model'
TOKENIZER_DIR = 'location_tokenizer'

# Restore the trained model and the tokenizer from their saved directories
loaded_model = tf.keras.models.load_model(MODEL_DIR)
loaded_tokenizer = BertTokenizer.from_pretrained(TOKENIZER_DIR)

# joblib.load unpickles the file, so only load scaler files you trust
loaded_scaler = joblib.load(scaler_path)

In [None]:





# Example Usage
def predict_location(city, country, lat, lng, population):
    """Score a single location with the reloaded model.

    Args:
        city: City name (string).
        country: Country name (string).
        lat: Raw (unscaled) latitude.
        lng: Raw (unscaled) longitude.
        population: Raw (unscaled) population.

    Returns:
        The first element of the first row of ``loaded_model.predict``'s
        output, i.e. the sigmoid score for this one location.
    """
    def _token_ids(text):
        # Training lower-cases city/country before tokenizing; do the same
        # here so inference preprocessing matches training.
        return loaded_tokenizer(
            text.lower(), padding='max_length', max_length=32, truncation=True, return_tensors="np"
        )['input_ids']

    # The scaler was fitted on a DataFrame with named columns; passing the
    # same column names avoids scikit-learn's feature-name mismatch warning.
    numeric_frame = pd.DataFrame(
        [[lat, lng, population]], columns=['lat', 'lng', 'population']
    )
    numeric_data = loaded_scaler.transform(numeric_frame)

    prediction = loaded_model.predict(
        [_token_ids(city), _token_ids(country), numeric_data]
    )
    return prediction[0][0]

# Example JSON Input
example_json = {
    "city": "louisville",
    "country": "usa",
    "lat": 38.2527,
    "lng": -85.7585,
    "population": 617638,
}

# Predict: pull the fields out in the positional order predict_location expects
feature_order = ("city", "country", "lat", "lng", "population")
predicted_value = predict_location(*(example_json[field] for field in feature_order))

print("Predicted Value:", predicted_value)