#

# **Website Recommendation System** 

### Import neccessary libraries

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import joblib

# # Data collection

In [2]:
# Load the dataset
file_path = r'dataset\qws2.csv'
df = pd.read_csv(file_path)

#### Analys the data

In [3]:
df

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,Service Name,WSDL Address,Unnamed: 11
0,302.75,89,7.1,90,73,78,80,187.75,32,MAPPMatching,http://xml.assessment.com/service/MAPPMatching...,
1,482.00,85,16.0,95,73,100,84,1.00,2,Compound2,http://www.mssoapinterop.org/asmx/WSDL/compoun...,
2,3321.40,89,1.4,96,73,78,80,2.60,96,USDAData,http://www.strikeiron.com/webservices/usdadata...,
3,126.17,98,12.0,100,67,78,82,22.77,89,GBNIRHolidayDates,http://www.holidaywebservice.com/Holidays/GBNI...,
4,107.00,87,1.9,95,73,89,62,58.33,93,CasUsers,http://galex.stsci.edu/casjobs/CasUsers.asmx?WSDL,
...,...,...,...,...,...,...,...,...,...,...,...,...
2498,200.80,93,2.4,98,73,100,84,7.40,41,garnierService,http://genome.dkfz-heidelberg.de/menu/hobit/em...,
2499,56.17,97,11.3,97,83,78,91,7.17,3,AWSAlexa,http://awis.amazonaws.com/AWSAlexa/AWSAlexa.wsdl,
2500,93.93,80,2.1,80,67,78,82,3.72,60,interop2,http://www.cs.fsu.edu/~engelen/interop2.wsdl,
2501,106.75,86,1.3,95,80,78,87,1.25,96,SailboatCalcsWS,http://pooh.poly.asu.edu/cst556-sailboatcalcsw...,


In [4]:
# Count rows and columns
df.shape

(2503, 12)

In [5]:
# columns names
df.columns

Index(['Response Time', ' Availability', ' Throughput', ' Successability',
       ' Reliability', ' Compliance', ' Best Practices', ' Latency',
       ' Documentation', ' Service Name', ' WSDL Address', 'Unnamed: 11'],
      dtype='object')

## Dataset Description

### QWS dataset - Quality Web Service 

| ID  | Parameter Name       | Description                                                    | Units              |
| --- | -------------------- | -------------------------------------------------------------- | ------------------ |
| 1   | Response Time        | Time taken to send a request and receive a response             | ms                 |
| 2   | Availability         | Number of successful invocations/total invocations              | %                  |
| 3   | Throughput           | Total Number of invocations for a given period of time          | invokes/second     |
| 4   | Successability        | Number of response / number of request messages                | %                  |
| 5   | Reliability          | Ratio of the number of error messages to total messages         | %                  |
| 6   | Compliance           | The extent to which a WSDL document follows WSDL specification  | %                  |
| 7   | Best Practices        | The extent to which a Web service follows WS-I Basic Profile    | %                  |
| 8   | Latency              | Time taken for the server to process a given request            | ms                 |
| 9   | Documentation        | Measure of documentation (i.e. description tags) in WSDL        | %                  |
| 10  | WsRF                 | Web Service Relevancy Function: a rank for Web Service Quality  | %                  |
| 11  | Service Classification | Levels representing service offering qualities (1 through 4)   | Classifier         |
| 12  | Service Name         | Name of the Web service                                         | -                  |
| 13  | WSDL Address         | Location of the Web Service Definition Language (WSDL) file on the Web | -              |


In [6]:
# Read first 5 rows
df.head()

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,Service Name,WSDL Address,Unnamed: 11
0,302.75,89,7.1,90,73,78,80,187.75,32,MAPPMatching,http://xml.assessment.com/service/MAPPMatching...,
1,482.0,85,16.0,95,73,100,84,1.0,2,Compound2,http://www.mssoapinterop.org/asmx/WSDL/compoun...,
2,3321.4,89,1.4,96,73,78,80,2.6,96,USDAData,http://www.strikeiron.com/webservices/usdadata...,
3,126.17,98,12.0,100,67,78,82,22.77,89,GBNIRHolidayDates,http://www.holidaywebservice.com/Holidays/GBNI...,
4,107.0,87,1.9,95,73,89,62,58.33,93,CasUsers,http://galex.stsci.edu/casjobs/CasUsers.asmx?WSDL,


In [7]:
# Read last 5 rows
df.tail()

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,Service Name,WSDL Address,Unnamed: 11
2498,200.8,93,2.4,98,73,100,84,7.4,41,garnierService,http://genome.dkfz-heidelberg.de/menu/hobit/em...,
2499,56.17,97,11.3,97,83,78,91,7.17,3,AWSAlexa,http://awis.amazonaws.com/AWSAlexa/AWSAlexa.wsdl,
2500,93.93,80,2.1,80,67,78,82,3.72,60,interop2,http://www.cs.fsu.edu/~engelen/interop2.wsdl,
2501,106.75,86,1.3,95,80,78,87,1.25,96,SailboatCalcsWS,http://pooh.poly.asu.edu/cst556-sailboatcalcsw...,
2502,316.5,72,15.9,72,80,89,87,5.5,8,Slash/SOAP/Test,http://use.perl.org/soap/Test.wsdl,


In [8]:
# Information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2503 entries, 0 to 2502
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Response Time    2503 non-null   float64
 1    Availability    2503 non-null   int64  
 2    Throughput      2503 non-null   float64
 3    Successability  2503 non-null   int64  
 4    Reliability     2503 non-null   int64  
 5    Compliance      2503 non-null   int64  
 6    Best Practices  2503 non-null   int64  
 7    Latency         2503 non-null   float64
 8    Documentation   2503 non-null   int64  
 9    Service Name    2503 non-null   object 
 10   WSDL Address    2503 non-null   object 
 11  Unnamed: 11      2 non-null      object 
dtypes: float64(3), int64(6), object(3)
memory usage: 234.8+ KB


In [9]:
# Information about the dataset
df.describe()

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation
count,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0
mean,383.632129,81.141031,9.035757,83.881742,69.777068,88.445865,79.296844,54.680995,31.279265
std,564.548568,18.701398,7.734667,19.903213,8.576118,10.023243,7.816809,191.857554,31.485635
min,37.0,7.0,0.1,8.0,33.0,33.0,50.0,0.25,1.0
25%,142.335,75.0,2.8,76.0,67.0,78.0,75.0,4.58,6.0
50%,226.6,87.0,6.9,95.0,73.0,89.0,82.0,11.83,11.0
75%,348.215,93.0,13.3,98.0,73.0,100.0,84.0,44.68,42.0
max,4989.67,100.0,43.1,100.0,89.0,100.0,95.0,4140.35,97.0


In [10]:
# Find duplicate rows
print(df.duplicated().sum())
duplicates = df[df.duplicated(keep=False)]
print(duplicates)

0
Empty DataFrame
Columns: [Response Time,  Availability,  Throughput,  Successability,  Reliability,  Compliance,  Best Practices,  Latency,  Documentation,  Service Name,  WSDL Address, Unnamed: 11]
Index: []


In [11]:
# Calculate empty values for all columns
df.isnull().sum()

Response Time         0
 Availability         0
 Throughput           0
 Successability       0
 Reliability          0
 Compliance           0
 Best Practices       0
 Latency              0
 Documentation        0
 Service Name         0
 WSDL Address         0
Unnamed: 11        2501
dtype: int64

In [12]:
df["Unnamed: 11"].value_counts()

Unnamed: 11
r=1.2/jax-wsa-sources/jaxwsa-ri/test/src/w3c/s11/etc/wsaTestService.wsdl    1
r=1.3/jax-wsa-sources/jaxwsa-ri/test/src/w3c/s12/etc/wsaTestService.wsdl    1
Name: count, dtype: int64

In [13]:
df[" Service Name"].value_counts()

 Service Name
AnalysisWSAppLabImplService    102
GoogleSearchService             43
Service                         28
AWSECommerceService             25
Service1                        17
                              ... 
ReportReadingWS                  1
TaxonomyManager                  1
RandomPlayer                     1
User                             1
Slash/SOAP/Test                  1
Name: count, Length: 1639, dtype: int64

In [14]:
# Display the number of unique values in the "WSDL Address" column
df[" WSDL Address"].nunique()

2379

# # Data Preprocess

In [15]:
df.drop("Unnamed: 11", axis=1, inplace=True)

In [16]:
df.columns

Index(['Response Time', ' Availability', ' Throughput', ' Successability',
       ' Reliability', ' Compliance', ' Best Practices', ' Latency',
       ' Documentation', ' Service Name', ' WSDL Address'],
      dtype='object')

Rename columns beacause some columns have spaces(" ") in it's name. It'll make confusion will deploying in frontend. So we are going to Rename those columns


In [17]:
# Rename columns
df = df.rename(columns={
    ' Availability': 'Availability',
    ' Throughput': 'Throughput',
    ' Successability': 'Successability',
    ' Reliability': 'Reliability',
    ' Compliance': 'Compliance',
    ' Best Practices': 'Best Practices',
    ' Latency': 'Latency',
    ' Documentation': 'Documentation',
    ' Service Name': 'Service Name',
    ' WSDL Address': 'WSDL Address'
})


In [18]:
df.columns

Index(['Response Time', 'Availability', 'Throughput', 'Successability',
       'Reliability', 'Compliance', 'Best Practices', 'Latency',
       'Documentation', 'Service Name', 'WSDL Address'],
      dtype='object')

In [19]:
# dataframe
df

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,Service Name,WSDL Address
0,302.75,89,7.1,90,73,78,80,187.75,32,MAPPMatching,http://xml.assessment.com/service/MAPPMatching...
1,482.00,85,16.0,95,73,100,84,1.00,2,Compound2,http://www.mssoapinterop.org/asmx/WSDL/compoun...
2,3321.40,89,1.4,96,73,78,80,2.60,96,USDAData,http://www.strikeiron.com/webservices/usdadata...
3,126.17,98,12.0,100,67,78,82,22.77,89,GBNIRHolidayDates,http://www.holidaywebservice.com/Holidays/GBNI...
4,107.00,87,1.9,95,73,89,62,58.33,93,CasUsers,http://galex.stsci.edu/casjobs/CasUsers.asmx?WSDL
...,...,...,...,...,...,...,...,...,...,...,...
2498,200.80,93,2.4,98,73,100,84,7.40,41,garnierService,http://genome.dkfz-heidelberg.de/menu/hobit/em...
2499,56.17,97,11.3,97,83,78,91,7.17,3,AWSAlexa,http://awis.amazonaws.com/AWSAlexa/AWSAlexa.wsdl
2500,93.93,80,2.1,80,67,78,82,3.72,60,interop2,http://www.cs.fsu.edu/~engelen/interop2.wsdl
2501,106.75,86,1.3,95,80,78,87,1.25,96,SailboatCalcsWS,http://pooh.poly.asu.edu/cst556-sailboatcalcsw...


In [20]:
# Save the specified columns of the dataframe as a CSV file
df[['Response Time', 'Availability', 'Throughput', 'Successability', 'Reliability', 'Compliance', 'Best Practices', 'Latency', 'Documentation', 'WSDL Address']].to_csv("dataset/Final_data.csv")

### Label encoding the object columns

In [21]:
# Store object column names
original_columns = df.select_dtypes(include='object').columns

# Initialize LabelEncoder
label_encoders = {}

# Apply LabelEncoder to each categorical variable
for col in original_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Print the mapping between original categories and numerical labels
for col, encoder in label_encoders.items():
    print(f"Mapping for column '{col}':")
    for label, category in enumerate(encoder.classes_):
        print(f"Label {label}: {category}")
    print("=====================")

Mapping for column 'Service Name':
Label 0: AAIService
Label 1: ABAService
Label 2: ABRXMLPubSub
Label 3: ABRXMLSearch
Label 4: ACCLIPServiceService
Label 5: ACMEProducts
Label 6: ADQLTranslator
Label 7: AIMSAddressWebService
Label 8: AMIWebServiceService
Label 9: AOSIFWSService
Label 10: ARSA
Label 11: ARTSPDF_Online_Collaboration
Label 12: ASPFAQs
Label 13: ASPNLexer
Label 14: ASPNLogin
Label 15: ASPNNewsFeeds
Label 16: ATMService
Label 17: ATTSMS
Label 18: AWSAlexa
Label 19: AWSECommerceService
Label 20: AWSMechanicalTurkRequester
Label 21: AccelerationUnit
Label 22: AccountEventHandler
Label 23: AccountInfo
Label 24: AccountManagement
Label 25: AccountService
Label 26: AccountServiceNoHeader
Label 27: AccountServices
Label 28: AddressBookService
Label 29: AddressFinder
Label 30: AddressLookup
Label 31: AddressManager
Label 32: AddressVerifier
Label 33: Admin
Label 34: AdminService
Label 35: Advanced
Label 36: AffiliateService
Label 37: Afstandsberegner
Label 38: AircraftData
Label 

In [22]:
# dataframe
df

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,Service Name,WSDL Address
0,302.75,89,7.1,90,73,78,80,187.75,32,609,2358
1,482.00,85,16.0,95,73,100,84,1.00,2,181,1864
2,3321.40,89,1.4,96,73,78,80,2.60,96,1087,2112
3,126.17,98,12.0,100,67,78,82,22.77,89,389,1733
4,107.00,87,1.9,95,73,89,62,58.33,93,154,242
...,...,...,...,...,...,...,...,...,...,...,...
2498,200.80,93,2.4,98,73,100,84,7.40,41,1391,295
2499,56.17,97,11.3,97,83,78,91,7.17,3,18,75
2500,93.93,80,2.1,80,67,78,82,3.72,60,1420,1290
2501,106.75,86,1.3,95,80,78,87,1.25,96,891,637


#### Build Recommendation system model

In [23]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import joblib

# Features for the recommendation
features = ['Response Time', 'Availability', 'Throughput', 'Successability', 
            'Reliability', 'Compliance', 'Best Practices', 'Latency', 'Documentation']

# Step 1: Normalize features
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Step 2: Train k-Nearest Neighbors model
knn_model = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn_model.fit(df[features])

# Step 3: Save both the model and the scaler
joblib.dump(knn_model, r'models\knn_recommender_model.joblib')
joblib.dump(scaler, r'modelsscaler.joblib')

['modelsscaler.joblib']

In [24]:
import joblib
import numpy as np
import pandas as pd

# Load the trained model and scaler and dataset
knn_model = joblib.load(r'models\knn_recommender_model.joblib')
scaler = joblib.load(r'models\scaler.joblib')
data = pd.read_csv(r"dataset\Final_data.csv")

def recommendations(input):
    # Step 1: Scale the user input
    user_input_scaled = scaler.transform([input])

    # Step 2: Get the 10 nearest neighbors
    distances, indices = knn_model.kneighbors(user_input_scaled)
    print("distances:", distances)
    print("indices:", indices)

    # Step 3: Retrieve the recommended WSDL Addresses
    for idx in indices[0]:  # Loop over the first row of the indices array
        url = data.iloc[idx]['WSDL Address']  # Get the WSDL Address for each index
        print(idx, " => ", url)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [25]:
# example input (index 0)
recommendations([302.75, 89, 7.1, 90, 73, 78, 80, 187.75, 32])

distances: [[0.         0.12303584 0.12327288 0.12436351 0.13526677 0.14221896
  0.14610035 0.14718515 0.15256321 0.15333333]]
indices: [[   0 1968  429  698 1318 2408 2336 1165 1145 1778]]
0  =>  http://xml.assessment.com/service/MAPPMatching.asmx?wsdl
1968  =>  http://www.xmlme.com/WSDailyXml.asmx?wsdl
429  =>  http://ws.cdyne.com/ziptogeo/zip2geo.asmx?wsdl
698  =>  http://xml.assessment.com/service/MAPPResults.asmx?wsdl
1318  =>  http://www.xmlme.com/WSDailyNet.asmx?wsdl
2408  =>  http://scores.serviceobjects.com/CupScores.asmx?WSDL
2336  =>  http://www.xmlme.com/WSSportingGoods.asmx?WSDL
1165  =>  http://www.webservicex.com/globalweather.asmx?WSDL
1145  =>  http://www.webservicex.net/UDDIBusinessFinder.asmx?WSDL
1778  =>  http://interlockingapplications.com/gs/wsDice.asmx?wsdl




In [26]:
# example input (index 2502)
recommendations([316.5, 72, 15.9, 72, 80, 89, 87, 5.5, 8])

distances: [[0.         0.06637653 0.07244216 0.07469639 0.08476595 0.08662802
  0.10154039 0.11371418 0.11584924 0.11809573]]
indices: [[2502  495  972  801  508 1277  822  420   79  478]]
2502  =>  http://use.perl.org/soap/Test.wsdl
495  =>  http://www.ebob42.com/cgi-bin/Romulan.exe/wsdl/IRoman
972  =>  http://oscar.snapps.com/portal.nsf/PlaceCatalog?WSDL
801  =>  http://developerdays.com/cgi-bin/tempconverter.exe/wsdl/ITempConverter?WSDL
508  =>  http://www.sandrini-a.com/ws/sandrini.wsdl
1277  =>  http://oscar.snapps.com/Portal.nsf/Echo?wsdl
822  =>  http://oscar.snapps.com/portal.nsf/MyService?WSDL
420  =>  http://email.secureserver.net/vsdbServer.php?wsdl
79  =>  http://sas-d.sermepa.es/TPV_PC/wsdl/SerClsEntradaTPVPCv3d0.wsdl
478  =>  http://tomcatbackup.esat.kuleuven.be/maran/Maran.wsdl




In [27]:
# example input (index 684)
recommendations([219, 87, 2.8, 96, 73, 78, 80, 17, 87, ])

distances: [[0.         0.06413985 0.07156172 0.08282155 0.08647759 0.08676694
  0.096541   0.09762119 0.0980833  0.10184511]]
indices: [[ 684   19 1730 2190 2251   91 1799 1692 1249 1227]]
684  =>  http://ws.keyfortravel.com/Hotel/hotelavail.asmx?wsdl
19  =>  http://trial.serviceobjects.com/fpl/FraudProtectionLite.asmx?WSDL
1730  =>  http://ws2.serviceobjects.net/pa/phoneappend.asmx?wsdl
2190  =>  http://www.gecip.net/SMSWS/sms.asmx?wsdl
2251  =>  http://ws.interfax.net/inbound.asmx?wsdl
91  =>  http://mackeybros.com/WebServices/UPSRates.asmx?wsdl
1799  =>  http://gisdata.usgs.net/XMLWebServices/USNG.asmx?WSDL
1692  =>  http://ws.keyfortravel.com/webservices/K4THotelSell.asmx?wsdl
1249  =>  http://www.fullerdata.com/FortuneCookie/FortuneCookie.asmx?WSDL
1227  =>  http://www.insuma.de/wsdl/demo.wsdl


