In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.linear_model import Ridge


In [2]:
# Load the training data (a zipped CSV; read_csv infers the compression from
# the file extension) and the small test sample.
TRAIN_PATH = "./salary-train.zip"
TEST_PATH = "./salary-test-mini.csv"

train, test = (pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH))


In [3]:
# Preview the first five raw training rows.
train.head(n=5)


Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [4]:
# Preview the first rows of the test sample.
test.head(n=5)


Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [5]:
# Text columns that feed the vectorizers. They are named explicitly rather than
# sliced by position so the numeric salary target is never touched.
TEXT_COLUMNS = ["FullDescription", "LocationNormalized", "ContractTime"]

# Anything that is not an ASCII letter or digit is turned into a space.
NON_ALNUM_PATTERN = "[^a-zA-Z0-9]"


def normalize_text_columns(frame, columns=TEXT_COLUMNS):
    """Return a copy of ``frame`` with the given text columns normalised.

    For each column in ``columns``: missing values become the literal string
    "nan" (so they form their own category downstream), values are cast to
    ``str``, lower-cased, and every non-alphanumeric character is replaced
    with a single space. All other columns are returned unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column listed in ``columns``.
    columns : sequence of str
        Names of the columns to normalise.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    cleaned = frame.copy()
    for column in columns:
        cleaned[column] = (
            frame[column]
            .fillna("nan")
            .astype(str)  # guard against stray non-string cells
            .str.lower()
            .str.replace(NON_ALNUM_PATTERN, " ", regex=True)
        )
    return cleaned


# Apply the identical transformation to both splits so the vectorizers fitted
# on train see the same token/category spelling in test.
train = normalize_text_columns(train)
test = normalize_text_columns(test)


In [6]:
train.head()


Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [7]:
# TF-IDF features for the job descriptions. The vocabulary and IDF weights are
# learned from the training texts only; min_df=5 drops terms that occur in
# fewer than five training documents. The test texts are projected onto that
# same vocabulary.
train_descriptions, test_descriptions = train["FullDescription"], test["FullDescription"]

tfidf_enc = TfidfVectorizer(min_df=5)
X_train_tfidf = tfidf_enc.fit_transform(train_descriptions)
X_test_tfidf = tfidf_enc.transform(test_descriptions)


In [8]:
# One-hot encode the two categorical columns. Each row is turned into a
# {column: value} dict, which DictVectorizer expands into indicator features.
CATEGORICAL_COLUMNS = ["LocationNormalized", "ContractTime"]

train_records = train[CATEGORICAL_COLUMNS].to_dict(orient="records")
test_records = test[CATEGORICAL_COLUMNS].to_dict(orient="records")

enc = DictVectorizer()
X_train_cat = enc.fit_transform(train_records)  # learns the category set
X_test_cat = enc.transform(test_records)  # reuses the training categories


In [9]:
# Build the final design matrices by placing the TF-IDF block and the one-hot
# block side by side (column-wise), keeping everything sparse.
train_blocks = [X_train_tfidf, X_train_cat]
test_blocks = [X_test_tfidf, X_test_cat]

X_train = sparse.hstack(train_blocks)
X_test = sparse.hstack(test_blocks)


In [10]:
# Ridge regression with the default regularisation strength spelled out and a
# fixed seed for reproducibility.
ridge = Ridge(alpha=1.0, random_state=241)


In [11]:
# Fit the model on the combined sparse features against the salary target.
y_train = train["SalaryNormalized"]
ridge.fit(X_train, y_train)


Ridge(random_state=241)

In [12]:
# Predict salaries for the test rows using the fitted model.
answer = ridge.predict(X=X_test)


In [13]:
# Round each prediction to two decimals. Values are converted to built-in
# floats first so the list prints as plain numbers (NumPy >= 2 would otherwise
# show every element as ``np.float64(...)``).
answer = [round(float(item), 2) for item in answer]


In [14]:
# Display the rounded predictions as the cell output.
answer


[56567.63, 37142.4]