In [7]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

In [8]:
# Read the training and test splits of the salary dataset from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('salary_train.csv', 'salary_test.csv')
)
# Bare last expression: the notebook renders the training frame as this cell's output.
train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,"As a result of continued growth, First Class S...",Whitley Bay,contract,26400
59996,PHP / MVC Web Developer MacclesfieldCirca ***...,Macclesfield,permanent,26000
59997,"Staff Nurse, Nursing Home, Baldock White Recru...",Baldock,,24500
59998,This is one of the best agency side opportunit...,The City,permanent,65000


In [9]:
# Display the test split; in the rows shown below SalaryNormalized is empty,
# i.e. it is the value the model is expected to predict.
test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [10]:
# Lower-case every job description so that the later TF-IDF vocabulary
# does not distinguish between differently-cased spellings of a word.
train['FullDescription'] = train['FullDescription'].str.lower()
train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager// luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,"as a result of continued growth, first class s...",Whitley Bay,contract,26400
59996,php / mvc web developer macclesfieldcirca ***...,Macclesfield,permanent,26000
59997,"staff nurse, nursing home, baldock white recru...",Baldock,,24500
59998,this is one of the best agency side opportunit...,The City,permanent,65000


In [11]:
# Turn every character that is not a Latin letter or a digit into a space,
# leaving only whitespace-separated alphanumeric tokens in the descriptions.
train['FullDescription'] = train['FullDescription'].replace(
    to_replace='[^a-zA-Z0-9]',
    value=' ',
    regex=True,
)
train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,as a result of continued growth first class s...,Whitley Bay,contract,26400
59996,php mvc web developer macclesfieldcirca ...,Macclesfield,permanent,26000
59997,staff nurse nursing home baldock white recru...,Baldock,,24500
59998,this is one of the best agency side opportunit...,The City,permanent,65000


In [21]:
# Fit TF-IDF on the cleaned training descriptions. min_df=5 is passed to the
# constructor (equivalent to setting the attribute before fitting): terms that
# occur in fewer than 5 documents are left out of the vocabulary.
vectorizer = TfidfVectorizer(min_df=5)
x_train1 = vectorizer.fit_transform(train['FullDescription'])
x_train1

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [22]:
# Apply the same cleaning as for the training descriptions:
# lower-case first, then replace every non-alphanumeric character with a space.
test['FullDescription'] = (
    test['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
# transform (not fit_transform): reuse the vectorizer already fitted on train
# so the test matrix has the same columns as x_train1.
x_test1 = vectorizer.transform(test['FullDescription'])
x_test1

<2x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Row format>

In [19]:
# Replace missing values in the two categorical columns with the literal
# string 'nan', so that "missing" becomes a category of its own.
#
# The filled column is assigned back instead of calling
# train[col].fillna(..., inplace=True): that form operates on a column selected
# from the frame, which pandas 2.x warns about and which, with Copy-on-Write
# enabled, does not modify `train` at all.
train['LocationNormalized'] = train['LocationNormalized'].fillna('nan')
train['ContractTime'] = train['ContractTime'].fillna('nan')
train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,as a result of continued growth first class s...,Whitley Bay,contract,26400
59996,php mvc web developer macclesfieldcirca ...,Macclesfield,permanent,26000
59997,staff nurse nursing home baldock white recru...,Baldock,,24500
59998,this is one of the best agency side opportunit...,The City,permanent,65000


In [27]:
# One-hot encode the two categorical columns. The encoder is fitted on train
# only and then applied to test, so both matrices share the same columns.
#
# Missing values are replaced with the string 'nan' right here for BOTH frames.
# A float NaN left in a record would be handled by DictVectorizer as a numeric
# feature value rather than as a category, putting NaN into the design matrix.
# For columns that were already filled this fillna is a no-op.
enc = DictVectorizer()
x_train2 = enc.fit_transform(
    train[['LocationNormalized', 'ContractTime']].fillna('nan').to_dict('records')
)
x_test2 = enc.transform(
    test[['LocationNormalized', 'ContractTime']].fillna('nan').to_dict('records')
)

In [29]:
from scipy.sparse import hstack

In [31]:
# Join the TF-IDF text features and the one-hot categorical features
# column-wise into a single sparse training matrix.
x_train = hstack([x_train1, x_train2])
x_train

<60000x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 8485759 stored elements in COOrdinate format>

In [32]:
# Same column-wise concatenation for the test rows: text features first,
# categorical features second, matching the column order of x_train.
x_test = hstack([x_test1, x_test2])
x_test

<2x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 304 stored elements in COOrdinate format>

In [33]:
# Regression target: the SalaryNormalized column of the training frame.
y = train.SalaryNormalized
y

0        33000
1        50000
2        40000
3        22500
4        20355
         ...  
59995    26400
59996    26000
59997    24500
59998    65000
59999    23040
Name: SalaryNormalized, Length: 60000, dtype: int64

In [34]:
# Fit ridge regression on the stacked sparse features (fit returns the
# estimator itself, so construction and fitting are chained), then predict
# salaries for the test rows as the cell output.
model = Ridge(alpha=1, random_state=241).fit(x_train, y)
model.predict(x_test)

array([56555.61500155, 37188.32442618])