In [7]:
## TO IMPROVE
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

# Historical projects are loaded from the Excel workbook.
df = pd.read_excel('Classeur2.xlsx')

# Description of the single incoming project we want recommendations for.
new_projects_data = dict(
    Budget=[1500],
    Duration=[3],
    Team_Size=[4],
    Client_Feedback=['good'],
    Functional_Requirements=['tribunal verification'],
    Technologies_Used=['Angular, Springboot, mongo'],
)
new_projects = pd.DataFrame(new_projects_data)

# Build one text document per project: requirements followed by technologies.
all_text = df['Functional_Requirements'] + ", " + df['Technologies_Used']
new_text = new_projects['Functional_Requirements'] + ", " + new_projects['Technologies_Used']

# The TF-IDF vocabulary is learned on the historical projects only; the new
# project is projected into that same feature space.
vectorizer = TfidfVectorizer()
sparse_matrix_past = vectorizer.fit_transform(all_text)
sparse_matrix_new = vectorizer.transform(new_text)

# cosine_similarity is called with the new project first and the past projects
# second; the result is transposed so that it can be squeezed into a column
# aligned with the rows of df.
similarity_matrix = cosine_similarity(sparse_matrix_new, sparse_matrix_past).T

# Store the similarity to the new project next to each historical project.
df['Similarity_Score'] = similarity_matrix.squeeze()

# Fit a linear support-vector regressor that maps the TF-IDF features of the
# past projects to their similarity score against the new project.
svm = SVR(kernel='linear')
svm.fit(sparse_matrix_past, df['Similarity_Score'])

# Predict the similarity score for the new project and store it alongside it.
similarity_scores = svm.predict(sparse_matrix_new)
new_projects['Predicted_Similarity_Score'] = similarity_scores

# Every historical project is a valid recommendation candidate.
# The previous filter `df.index != df.index[-1]` was meant to exclude "the new
# project itself", but the new project is never appended to df, so it silently
# dropped the last historical project instead.
recommendations = df.copy()

# Sort recommendations by similarity score, most similar first
recommendations_sorted = recommendations.sort_values(by='Similarity_Score', ascending=False)

# Print top recommendations for new projects
print("Top Recommendations for New Projects:")
print(tabulate(recommendations_sorted.head(), headers='keys', showindex=False))

## TESTING METRICS
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(sparse_matrix_past, df['Similarity_Score'], test_size=0.2, random_state=42)

# Initialize SVM model
svm = SVR(kernel='linear')

# Train SVM model on the training split only
svm.fit(X_train, y_train)

# Predict similarity scores for testing data
y_pred = svm.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("Coefficient of Determination (R-squared):", r_squared)


import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tabulate import tabulate
import numpy as np
import pickle

# Read data for old projects from Excel file
df = pd.read_excel('Classeur2.xlsx')

# Define the data for the new project we want recommendations for
new_projects_data = {
    'Budget': [4000],
    'Duration': [3],
    'Team_Size': [4],
    'Client_Feedback': ['good'],
    'Functional_Requirements': ['tribunal verification'],
    'Technologies_Used': ['Mongo, HTML, CSS']
}

# Create the DataFrame for new projects
new_projects = pd.DataFrame(new_projects_data)

# Build one text document per project: requirements followed by technologies
all_text = df['Functional_Requirements'] + ", " + df['Technologies_Used']
new_text = new_projects['Functional_Requirements'] + ", " + new_projects['Technologies_Used']

# Initialize TF-IDF vectorizer with English stop-word removal
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary on the past projects, then project the new one into it
sparse_matrix_past = vectorizer.fit_transform(all_text)
sparse_matrix_new = vectorizer.transform(new_text)

# Export the fitted vectorizer for use in the Flask app.
# The context manager closes the file deterministically; the previous
# `pickle.dump(..., open(...))` form left the handle open.
with open("vectorizerr.sav", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Compute cosine similarity between the new project and the past projects
similarity_matrix = cosine_similarity(sparse_matrix_new, sparse_matrix_past)

# Transpose so the scores can be squeezed into a column aligned with df's rows
similarity_matrix = similarity_matrix.T

# Add a new column for similarity scores to the DataFrame df and save it as CSV
df['Similarity_Score'] = similarity_matrix.squeeze()
df.to_csv(r"Classeur_Sim.csv", index=False)

Top Recommendations for New Projects:
  Project_ID   Project_Name                                                                                     Budget    Duration    Team_Size   Client_Feedback      Success   Functional_Requirements                    Technologies_Used              Similarity_Score
-------------  ---------------------------------------------------------------------------------------------  --------  ----------  ------------  -----------------  ----------  -----------------------------------------  ---------------------------  ------------------
          148  Création d'une application de gestion de tâches                                                   22000          15             5  Bon                         1  Suivi des tâches, Rappels                  Angular, Firebase                      0.47751
           61  Mise en place d'un système de gestion des stocks pour une entreprise de commerce électronique     35000          18            10  Excellent    



In [8]:
df

Unnamed: 0,Project_ID,Project_Name,Budget,Duration,Team_Size,Client_Feedback,Success,Functional_Requirements,Technologies_Used,Similarity_Score
0,1,Website Redesign,1000,5,5,Excellent,1,"Improve user interface, Add e-commerce functio...","HTML, CSS, JavaScript, PHP",0.274223
1,2,Mobile App Launch,2000,2,6,good,0,"User authentication, In-app purchases","React Native, Firebase",0.000000
2,3,Marketing Campaign,25000,10,10,Excellent,1,"Social media integration, Email marketing auto...","MailChimp, Facebook Ads Manager",0.000000
3,4,Product Development,20000,24,12,Excellent,1,"Prototype development, Feature prioritization","Python, Django, PostgreSQL",0.000000
4,5,Consulting Project,30000,6,4,good,1,"Needs analysis, Solution design","Microsoft Excel, PowerPoint",0.000000
...,...,...,...,...,...,...,...,...,...,...
291,297,Application de gestion de bibliothèque en ligne,28000,14,6,Bon,1,"Gestion des emprunts, Catalogue en ligne","React, Django, PostgreSQL",0.000000
292,298,Plateforme de suivi de la santé mentale,32000,16,8,Excellent,1,"Suivi des humeurs, Ressources de bien-être","Vue.js, Flask, MongoDB",0.000000
293,299,Application de gestion de salles de sport,30000,15,7,Bon,1,"Gestion des adhérents, Planification des séances","Angular, ASP.NET, SQL Server",0.000000
294,300,Plateforme de covoiturage urbain,35000,18,9,Excellent,1,"Partage de trajets, Gestion des paiements","React Native, Node.js, MongoDB",0.000000


In [None]:

# Hold out 20% of the historical projects for the final model comparison.
X_train, X_test, y_train, y_test = train_test_split(
    sparse_matrix_past,
    df['Similarity_Score'],
    test_size=0.2,
    random_state=42,
)

# Candidate regressors.
svm = SVR(kernel='linear')
rf = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter grids explored for each candidate.
param_grid_svm = dict(C=[0.1, 1, 10], epsilon=[0.01, 0.1, 1])
param_grid_rf = dict(n_estimators=[50, 100, 200], max_depth=[None, 10, 20])
param_grid_gbr = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# One 5-fold grid search per candidate, scored with 'r2'.
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='r2')
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='r2')
grid_search_gbr = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='r2')

# Run the searches on the training split, in the order SVM, RF, GBR.
for _search in (grid_search_svm, grid_search_rf, grid_search_gbr):
    _search.fit(X_train, y_train)

# Cross-validate each winning configuration on the training split
# (5 folds, each estimator's default scorer).
cv_scores_svm, cv_scores_rf, cv_scores_gbr = (
    cross_val_score(_search.best_estimator_, X_train, y_train, cv=5)
    for _search in (grid_search_svm, grid_search_rf, grid_search_gbr)
)

print("SVM Cross-Validation Scores:", cv_scores_svm)
print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Gradient Boosting Cross-Validation Scores:", cv_scores_gbr)

# Score the new project with every tuned model and keep the result next to it.
similarity_scores_svm = grid_search_svm.best_estimator_.predict(sparse_matrix_new)
new_projects['Predicted_Similarity_Score_SVM'] = similarity_scores_svm

similarity_scores_rf = grid_search_rf.best_estimator_.predict(sparse_matrix_new)
new_projects['Predicted_Similarity_Score_RF'] = similarity_scores_rf

similarity_scores_gbr = grid_search_gbr.best_estimator_.predict(sparse_matrix_new)
new_projects['Predicted_Similarity_Score_GBR'] = similarity_scores_gbr

# Predictions on the held-out split, used for the final comparison.
y_pred_svm = grid_search_svm.best_estimator_.predict(X_test)
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)
y_pred_gbr = grid_search_gbr.best_estimator_.predict(X_test)
# Calculate evaluation metrics for each model
def print_metrics(y_test, y_pred, model_name):
    """Print MSE, RMSE, MAE and R-squared for one model's predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        model_name: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    # Every metric is computed before anything is printed, so a failing
    # metric never leaves a half-written report.
    mse = mean_squared_error(y_test, y_pred)
    report = (
        ("Mean Squared Error (MSE):", mse),
        ("Root Mean Squared Error (RMSE):", np.sqrt(mse)),
        ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
        ("Coefficient of Determination (R-squared):", r2_score(y_test, y_pred)),
    )

    print(f"{model_name} Metrics:")
    for label, value in report:
        print(label, value)
    # Visual separator between consecutive model reports.
    print("\n")

# Report hold-out metrics for every tuned candidate.
for _name, _pred in (
    ("SVM", y_pred_svm),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gbr),
):
    print_metrics(y_test, _pred, _name)

# Pick the candidate with the highest hold-out R-squared
# (on a tie the first entry wins).
_test_r2_by_model = {
    "SVM": r2_score(y_test, y_pred_svm),
    "Random Forest": r2_score(y_test, y_pred_rf),
    "Gradient Boosting": r2_score(y_test, y_pred_gbr),
}
best_model_name = max(_test_r2_by_model, key=_test_r2_by_model.get)

print(f"The best model is {best_model_name}")

# Look up the tuned estimator that belongs to the winning name.
_tuned_estimators = {
    "SVM": grid_search_svm.best_estimator_,
    "Random Forest": grid_search_rf.best_estimator_,
    "Gradient Boosting": grid_search_gbr.best_estimator_,
}
best_model = _tuned_estimators[best_model_name]


import pickle

# Persist the selected model to disk. The context manager closes the file
# deterministically; the previous `pickle.dump(..., open(...))` form left the
# handle open.
with open("best_model_recommendation.sav", "wb") as model_file:
    pickle.dump(best_model, model_file)

# Predict similarity scores for new projects using the best model
similarity_scores_best = best_model.predict(sparse_matrix_new)
new_projects['Predicted_Similarity_Score'] = similarity_scores_best

# Add the best model's predicted score for every past project, used for sorting
df['Predicted_Similarity_Score'] = best_model.predict(sparse_matrix_past)

# Every historical project is a valid recommendation candidate.
# The previous filter `df.index != df.index[-1]` was meant to exclude "the new
# project itself", but the new project is never appended to df, so it silently
# dropped the last historical project instead.
recommendations = df.copy()

# Sort recommendations by predicted similarity score, most similar first
recommendations_sorted = recommendations.sort_values(by='Predicted_Similarity_Score', ascending=False)

# Print top recommendations for new projects
print("Top Recommendations for New Projects:")
print(tabulate(recommendations_sorted.head(), headers='keys', showindex=False))