1. Carregamento do dataset

In [76]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [77]:
# Caminho do ficheiro
file_path = r'C:\Users\marianas\OneDrive - Capgemini\Documents\Projeto R&D\ID Cards\November 2025\ID_8.11.1\data\qws2.txt'

# Ler todas as linhas, ignorando comentários
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if not line.startswith('#') and line.strip()]

# Processar cada linha: separar por vírgula e corrigir colunas extra
cleaned_data = []
for line in lines:
    parts = line.split(',')
    if len(parts) > 11:
        # Junta colunas extra no último campo (WSDL Address)
        parts = parts[:10] + [','.join(parts[10:])]
    cleaned_data.append(parts)

# Criar DataFrame com nomes de colunas
columns = [
    'ResponseTime', 'Availability', 'Throughput', 'Successability', 'Reliability',
    'Compliance', 'BestPractices', 'Latency', 'Documentation', 'ServiceName', 'WSDLAddress'
]

df = pd.DataFrame(cleaned_data, columns=columns)

# Mostrar primeiras linhas e número total de linhas
print(df.head())
print("\nNúmero total de linhas:", len(df))



  ResponseTime Availability Throughput Successability Reliability Compliance  \
0       302.75           89        7.1             90          73         78   
1          482           85         16             95          73        100   
2       3321.4           89        1.4             96          73         78   
3       126.17           98         12            100          67         78   
4          107           87        1.9             95          73         89   

  BestPractices Latency Documentation        ServiceName  \
0            80  187.75            32       MAPPMatching   
1            84       1             2          Compound2   
2            80     2.6            96           USDAData   
3            82   22.77            89  GBNIRHolidayDates   
4            62   58.33            93           CasUsers   

                                         WSDLAddress  
0  http://xml.assessment.com/service/MAPPMatching...  
1  http://www.mssoapinterop.org/asmx/WSDL/compou

In [78]:

# Quick overview
print(df.shape)
print(df.info())
print(df.describe())
print(df.isnull().sum())

(2507, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2507 entries, 0 to 2506
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ResponseTime    2507 non-null   object
 1   Availability    2507 non-null   object
 2   Throughput      2507 non-null   object
 3   Successability  2507 non-null   object
 4   Reliability     2507 non-null   object
 5   Compliance      2507 non-null   object
 6   BestPractices   2507 non-null   object
 7   Latency         2507 non-null   object
 8   Documentation   2507 non-null   object
 9   ServiceName     2507 non-null   object
 10  WSDLAddress     2507 non-null   object
dtypes: object(11)
memory usage: 215.6+ KB
None
       ResponseTime Availability Throughput Successability Reliability  \
count          2507         2507       2507           2507        2507   
unique         1905           90        322             92          13   
top             115           83        

In [79]:
#convertendo colunas numéricas (são inicialmente lidas como strings)
for col in df.columns[:-2]:  # exceto ServiceName e WSDLAddress
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [80]:

# Imputação numérica
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

if len(numeric_cols) > 0:
    df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])
else:
    print("Nenhuma coluna numérica encontrada.")

# Imputação categórica
categorical_cols = ['ServiceName', 'WSDLAddress']  # variáveis categóricas
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

In [81]:
df.head()

Unnamed: 0,ResponseTime,Availability,Throughput,Successability,Reliability,Compliance,BestPractices,Latency,Documentation,ServiceName,WSDLAddress
0,302.75,89,7.1,90,73,78,80,187.75,32,MAPPMatching,http://xml.assessment.com/service/MAPPMatching...
1,482.0,85,16.0,95,73,100,84,1.0,2,Compound2,http://www.mssoapinterop.org/asmx/WSDL/compoun...
2,3321.4,89,1.4,96,73,78,80,2.6,96,USDAData,http://www.strikeiron.com/webservices/usdadata...
3,126.17,98,12.0,100,67,78,82,22.77,89,GBNIRHolidayDates,http://www.holidaywebservice.com/Holidays/GBNI...
4,107.0,87,1.9,95,73,89,62,58.33,93,CasUsers,http://galex.stsci.edu/casjobs/CasUsers.asmx?WSDL


In [82]:
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    print(f"Coluna: {col}")
    print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
    print("-" * 40)

    # Se quiser aplicar o filtro depois:
    # df = df[(df[col] >= Q1 - 1.5*IQR) & (df[col] <= Q3 + 1.5*IQR)]



Coluna: ResponseTime
Q1: 142.33499999999998, Q3: 348.66499999999996, IQR: 206.32999999999998
----------------------------------------
Coluna: Availability
Q1: 75.0, Q3: 93.0, IQR: 18.0
----------------------------------------
Coluna: Throughput
Q1: 2.8, Q3: 13.3, IQR: 10.5
----------------------------------------
Coluna: Successability
Q1: 76.0, Q3: 98.0, IQR: 22.0
----------------------------------------
Coluna: Reliability
Q1: 67.0, Q3: 73.0, IQR: 6.0
----------------------------------------
Coluna: Compliance
Q1: 78.0, Q3: 100.0, IQR: 22.0
----------------------------------------
Coluna: BestPractices
Q1: 75.0, Q3: 84.0, IQR: 9.0
----------------------------------------
Coluna: Latency
Q1: 4.6, Q3: 44.68, IQR: 40.08
----------------------------------------
Coluna: Documentation
Q1: 6.0, Q3: 42.0, IQR: 36.0
----------------------------------------


In [83]:
df_encoded = df.copy()

# padroniza os dados numéricos (média = 0 e desvio padrão = 1)
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

#útil para algoritmos sensíveis à escala (regressão, clustering, redes neurais)


In [84]:
print(df_encoded)

      ResponseTime  Availability  Throughput  Successability  Reliability  \
0        -0.143703      0.420077   -0.250417        0.307201     0.375176   
1         0.173976      0.206145    0.901052        0.558474     0.375176   
2         5.206152      0.420077   -0.987875        0.608728     0.375176   
3        -0.456650      0.901424    0.383537        0.809746    -0.324650   
4        -0.490624      0.313111   -0.923186        0.558474     0.375176   
...            ...           ...         ...             ...          ...   
2502     -0.324385      0.634009   -0.858496        0.709237     0.375176   
2503     -0.580708      0.847941    0.292972        0.658983     1.541552   
2504     -0.513787     -0.061270   -0.897310       -0.195345    -0.324650   
2505     -0.491067      0.259628   -1.000813        0.558474     1.191640   
2506     -0.119334     -0.489134    0.888114       -0.597382     1.191640   

      Compliance  BestPractices   Latency  Documentation        ServiceName

Objetivo do projeto: prever WebService: Response Time, Latency, Availability (classificação)
- Prever desempenho do serviço → usar ResponseTime ou Availability
- Classificar serviços → Availability.

In [85]:
df.columns

Index(['ResponseTime', 'Availability', 'Throughput', 'Successability',
       'Reliability', 'Compliance', 'BestPractices', 'Latency',
       'Documentation', 'ServiceName', 'WSDLAddress'],
      dtype='object')

In [86]:

X = df.drop('ResponseTime', axis=1)
y = df['ResponseTime']

numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = ['ServiceName', 'WSDLAddress']  # se existirem no X

print("Colunas numéricas:", numeric_cols.tolist())
print("Colunas categóricas:", categorical_cols)


Colunas numéricas: ['Availability', 'Throughput', 'Successability', 'Reliability', 'Compliance', 'BestPractices', 'Latency', 'Documentation']
Colunas categóricas: ['ServiceName', 'WSDLAddress']


In [87]:
# Dividir em treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transformadores
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Pipeline com modelo
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Treinar
model.fit(X_train, y_train)

# Avaliar
score = model.score(X_test, y_test)
print(f"R² no conjunto de teste: {score:.4f}")


R² no conjunto de teste: 0.1272
